package intoxicant.analytics.coreNlp;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.Pair;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.util.Version;
import org.junit.Before;

import java.util.List;
import java.util.Properties;
import java.util.Set;

import static org.junit.Assert.*;

/**
 * User: jconwell
 * Unit tests for StopwordAnnotator.
 *
 * Each token is annotated with a {@code Pair<Boolean, Boolean>} where
 * {@code first()} indicates the token's surface form is a stopword and
 * {@code second()} indicates the token's lemma is a stopword.
 */
public class StopwordAnnotatorTest {

    /**
     * The standard Lucene stopword list is as follows:
     * "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
     * "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
     * "they", "this", "to", "was", "will", "with"
     */

    //sample text for tests
    private static final String example =
            "The history of NLP generally starts in the 1950s, although work can be found from earlier periods.";

    //adding a couple extra terms to standard lucene list to test against
    private static final String customStopWordList =
            "start,starts,period,periods,a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of," +
            "on,or,such,that,the,their,then,there,these,they,this,to,was,will,with";

    Properties props = new Properties();

    @Before
    public void before() {
        props.put("annotators", "tokenize, ssplit, stopword");
        props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
    }

    /**
     * Validates that the annotator reports exactly one satisfied requirement: the stopword annotation.
     */
    @org.junit.Test
    public void testRequirementsSatisfied() throws Exception {
        StopwordAnnotator sw = new StopwordAnnotator(StopwordAnnotator.ANNOTATOR_CLASS, props);
        assertTrue(sw.requirementsSatisfied().size() == 1);
        assertTrue(sw.requirementsSatisfied().contains(StopwordAnnotator.STOPWORD_REQUIREMENT));
    }

    /**
     * Validates the annotator's upstream requirements with and without lemma checking enabled.
     */
    @org.junit.Test
    public void testRequires() throws Exception {
        //Test that if lemmatization is not being checked, requirements only returns tokenize and ssplit
        StopwordAnnotator sw = new StopwordAnnotator(StopwordAnnotator.ANNOTATOR_CLASS, props);
        assertTrue(sw.requires().size() == 2);
        assertTrue(sw.requires().contains(StopwordAnnotator.TOKENIZE_REQUIREMENT));
        assertTrue(sw.requires().contains(StopwordAnnotator.SSPLIT_REQUIREMENT));

        //Test that if lemmatization is being checked, requirements returns tokenize, ssplit, pos, and lemma
        props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, stopword");
        props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
        props.setProperty(StopwordAnnotator.CHECK_LEMMA, "true");

        sw = new StopwordAnnotator(StopwordAnnotator.ANNOTATOR_CLASS, props);
        assertTrue(sw.requires().size() == 4);
        assertTrue(sw.requires().contains(StopwordAnnotator.TOKENIZE_REQUIREMENT));
        assertTrue(sw.requires().contains(StopwordAnnotator.SSPLIT_REQUIREMENT));
        assertTrue(sw.requires().contains(StopwordAnnotator.POS_REQUIREMENT));
        assertTrue(sw.requires().contains(StopwordAnnotator.LEMMA_REQUIREMENT));
    }

    /**
     * Validates that the annotation type is the {@code Pair} class used to carry the two stopword flags.
     */
    @org.junit.Test
    public void testGetType() throws Exception {
        StopwordAnnotator sw = new StopwordAnnotator(StopwordAnnotator.ANNOTATOR_CLASS, props);
        assertTrue(sw.getType().equals(Pair.makePair(true, true).getClass()));
    }

    /**
     * *****The following unit tests demonstrate how to use the StopwordAnnotator in CoreNlp*****
     */

    /**
     * Test to validate that stopwords are properly annotated in the token list
     * @throws Exception
     */
    @org.junit.Test
    public void testLuceneStopwordList() throws Exception {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, stopword");
        props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document = new Annotation(example);
        pipeline.annotate(document);
        List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

        //get the standard lucene stopword set
        Set<?> stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

        for (CoreLabel token : tokens) {
            //get the stopword annotation
            Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

            String word = token.word().toLowerCase();
            if (stopWords.contains(word)) {
                assertTrue(stopword.first());
            }
            else {
                assertFalse(stopword.first());
            }

            //not checking lemma, so always false
            assertFalse(stopword.second());
        }
    }

    /**
     * Test to validate that the custom stopword list works
     * @throws Exception
     */
    @org.junit.Test
    public void testCustomStopwordList() throws Exception {
        //setup coreNlp properties for stopwords. Note the custom stopword list property
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, stopword");
        props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
        props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);

        //get the custom stopword set
        Set<?> stopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document = new Annotation(example);
        pipeline.annotate(document);
        List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            //get the stopword annotation
            Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

            String word = token.word().toLowerCase();
            if (stopWords.contains(word)) {
                assertTrue(stopword.first());
            }
            else {
                assertFalse(stopword.first());
            }

            //not checking lemma, so always false
            assertFalse(stopword.second());
        }
    }

    /**
     * Test to validate that lemma values are checked against the (custom) stopword list
     *
     * NOTE: since we're loading the pos model into memory you'll need to set the VM memory size via '-Xms512m -Xmx1048m'
     * @throws Exception
     */
    @org.junit.Test
    public void testStopwordsWithLemma() throws Exception {
        //setup coreNlp properties for stopwords. Note the custom stopword list and check for lemma property
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, stopword");
        props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");
        props.setProperty(StopwordAnnotator.STOPWORDS_LIST, customStopWordList);
        props.setProperty(StopwordAnnotator.CHECK_LEMMA, "true");

        //get the custom stopword set
        Set<?> stopWords = StopwordAnnotator.getStopWordList(Version.LUCENE_36, customStopWordList, true);

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document = new Annotation(example);
        pipeline.annotate(document);
        List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            //get the stopword annotation
            Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

            //the word flag (pair.first) reflects the surface form
            String word = token.word().toLowerCase();
            if (stopWords.contains(word)) {
                assertTrue(stopword.first());
            }
            else {
                assertFalse(stopword.first());
            }

            //BUGFIX: the lemma flag is carried in pair.second, not pair.first. The original
            //assertions re-checked first() here, which both left the lemma annotation untested
            //and contradicted the word assertions whenever word/lemma membership differed
            //(e.g. "starts" vs lemma "start").
            String lemma = token.lemma().toLowerCase();
            if (stopWords.contains(lemma)) {
                assertTrue(stopword.second());
            }
            else {
                assertFalse(stopword.second());
            }
        }
    }
}