org.carrot2.core.LanguageCode Java Examples

The following examples show how to use org.carrot2.core.LanguageCode. Each example notes the project and source file it was taken from, along with that project's license.
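Before the individual examples, a quick orientation: LanguageCode is the enum Carrot2 uses to identify a document's language, and language-specific components (stemmers, tokenizers, lexical data) are chosen based on it. A recurring pattern in the examples below is to pass a LanguageCode when constructing a Document. The minimal sketch below illustrates that pattern; the class name LanguageCodeSketch is made up for illustration, while the Document constructor and getField call are taken from Examples #15 and #19 on this page.

import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;

// Hypothetical demo class (not part of Carrot2); it only combines calls that
// also appear in the examples below.
public class LanguageCodeSketch {
    public static void main(String[] args) {
        // LanguageCode is a plain Java enum, so values() and name() behave as usual.
        for (LanguageCode code : LanguageCode.values()) {
            System.out.println(code.name());
        }

        // Tag a document with an explicit language, as Examples #15 and #19 do.
        Document doc = new Document("Title", "Summary text",
            "http://example.com/doc", LanguageCode.ENGLISH);
        System.out.println(doc.getField(Document.TITLE));
    }
}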
Example #1
Source File: UsingCustomLanguageModel.java    From scava with Eclipse Public License 2.0
/**
 * Custom language model implementation. This one uses some contrived algorithms
 * and stop words just to demonstrate how they work.
 */
@Override
public IStemmer getStemmer(LanguageCode languageCode)
{
    // Here we always return the same language model, regardless of the requested
    // language. In your implementation you may want to return different models
    // based on the language, if needed.
    System.out.println("stemmer");
    return new IStemmer()
    {
        public CharSequence stem(CharSequence word)
        {
            // Some contrived stemming algorithm
            return word.length() > 3 ? word.subSequence(0, word.length() - 2)
                : null;
        }
    };
}
 
Example #2
Source File: DuplicatingTokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public ITokenizer getTokenizer(LanguageCode language) {
  return new ITokenizer() {
    private final ExtendedWhitespaceTokenizer delegate = new ExtendedWhitespaceTokenizer();
    
    @Override
    public void setTermBuffer(MutableCharArray buffer) {
      delegate.setTermBuffer(buffer);
      buffer.reset(buffer.toString() + buffer.toString());
    }
    
    @Override
    public void reset(Reader input) {
      delegate.reset(input);
    }
    
    @Override
    public short nextToken() throws IOException {
      return delegate.nextToken();
    }
  };
}
 
Example #3
Source File: LexicalResourcesCheckClusteringAlgorithm.java    From lucene-solr with Apache License 2.0
@Override
public void process() throws ProcessingException {
  clusters = new ArrayList<>();
  if (wordsToCheck == null) {
    return;
  }

  // Test with Maltese so that the English clustering performed in other tests
  // is not affected by the test stopwords and stoplabels.
  ILexicalData lexicalData = preprocessing.lexicalDataFactory
      .getLexicalData(LanguageCode.MALTESE);

  for (String word : wordsToCheck.split(",")) {
    if (!lexicalData.isCommonWord(new MutableCharArray(word))
        && !lexicalData.isStopLabel(word)) {
      clusters.add(new Cluster(word));
    }
  }
}
 
Example #4
Source File: LuceneCarrot2StemmerFactory.java    From lucene-solr with Apache License 2.0
/**
 * Create and return an {@link IStemmer} adapter for a
 * {@link SnowballStemmer} for a given language code. An identity stemmer is
 * returned for unknown languages.
 */
public static IStemmer createStemmer(LanguageCode language) {
  final Class<? extends SnowballStemmer> stemmerClazz = snowballStemmerClasses
      .get(language);

  if (stemmerClazz == null) {
    log.warn("No Snowball stemmer class for: {}. "
        + "Quality of clustering may be degraded.", language.name());
    return IdentityStemmer.INSTANCE;
  }

  try {
    return new SnowballStemmerAdapter(stemmerClazz.getConstructor().newInstance());
  } catch (Exception e) {
    log.warn("Could not instantiate snowball stemmer for language: {}"
            + ". Quality of clustering may be degraded."
        , language.name(), e);

    return IdentityStemmer.INSTANCE;
  }
}
 
Example #5
Source File: LuceneCarrot2StemmerFactory.java    From lucene-solr with Apache License 2.0
@Override
public IStemmer getStemmer(LanguageCode language) {
  switch (language) {
  case ARABIC:
    return ArabicStemmerFactory.createStemmer();

  case CHINESE_SIMPLIFIED:
    return IdentityStemmer.INSTANCE;

  default:
    /*
     * For other languages, try to use snowball's stemming.
     */
    return SnowballStemmerFactory.createStemmer(language);
  }
}
 
Example #6
Source File: LuceneCarrot2TokenizerFactory.java    From lucene-solr with Apache License 2.0
@Override
public ITokenizer getTokenizer(LanguageCode language) {
  switch (language) {
  case CHINESE_SIMPLIFIED:
    return ChineseTokenizerFactory.createTokenizer();

    /*
     * We use our own analyzer for Arabic. Lucene's version has special
     * support for Nonspacing-Mark characters (see
     * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
     * have them included as letters in the parser.
     */
  case ARABIC:
    // Intentional fall-through.

  default:
    return new ExtendedWhitespaceTokenizer();
  }
}
 
Example #7
Source File: UsingCustomLanguageModel.java    From scava with Eclipse Public License 2.0
@Override
public ILexicalData getLexicalData(LanguageCode languageCode)
{
    // Here we always return the same language model, regardless of the requested
    // language. In your implementation you may want to return different models
    // based on the language, if needed.
    System.out.println("lexical data");
    return new ILexicalData()
    {
        @Override
        public boolean isStopLabel(CharSequence formattedLabel)
        {
            return formattedLabel.length() <= 4;
        }

        @Override
        public boolean isCommonWord(MutableCharArray word)
        {
            return STOP_WORDS.contains(word.toString());
        }
    };
}
 
Example #8
Source File: UsingCustomLanguageModel.java    From scava with Eclipse Public License 2.0
@Override
public ITokenizer getTokenizer(LanguageCode languageCode)
{
    // Here we always return the same language model, regardless of the requested
    // language. In your implementation you may want to return different models
    // based on the language, if needed.
    System.out.println("tokenizer");
    return new ExtendedWhitespaceTokenizer();
}
 
Example #9
Source File: DuplicatingStemmerFactory.java    From lucene-solr with Apache License 2.0
@Override
public IStemmer getStemmer(LanguageCode language) {
  return new IStemmer() {
    @Override
    public CharSequence stem(CharSequence word) {
      return word.toString() + word.toString();
    }
  };
}
 
Example #10
Source File: EchoTokensClusteringAlgorithm.java    From lucene-solr with Apache License 2.0
@Override
public void process() throws ProcessingException {
  final PreprocessingContext preprocessingContext = preprocessing.preprocess(
      documents, "", LanguageCode.ENGLISH);
  clusters = new ArrayList<>();
  for (char[] token : preprocessingContext.allTokens.image) {
    if (token != null) {
      clusters.add(new Cluster(new String(token)));
    }
  }
}
 
Example #11
Source File: EchoStemsClusteringAlgorithm.java    From lucene-solr with Apache License 2.0
@Override
public void process() throws ProcessingException {
  final PreprocessingContext preprocessingContext = preprocessing.preprocess(
      documents, "", LanguageCode.ENGLISH);
  final AllTokens allTokens = preprocessingContext.allTokens;
  final AllWords allWords = preprocessingContext.allWords;
  final AllStems allStems = preprocessingContext.allStems;
  clusters = new ArrayList<>();
  for (int i = 0; i < allTokens.image.length; i++) {
    if (allTokens.wordIndex[i] >= 0) {
      clusters.add(new Cluster(new String(
          allStems.image[allWords.stemIndex[allTokens.wordIndex[i]]])));
    }
  }
}
 
Example #12
Source File: CarrotClusteringEngineTest.java    From lucene-solr with Apache License 2.0
@Test
public void testOneCarrot2SupportedLanguage() throws Exception {
  final ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

  final List<String> labels = getLabels(checkEngine(
      getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
          "one_supported_language")), params).get(0));
  assertEquals(3, labels.size());
  assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
}
 
Example #13
Source File: CarrotClusteringEngineTest.java    From lucene-solr with Apache License 2.0
@Test
public void testOneCarrot2SupportedLanguageOfMany() throws Exception {
  final ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
  
  final List<String> labels = getLabels(checkEngine(
      getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
          "one_supported_language_of_many")), params).get(0));
  assertEquals(3, labels.size());
  assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
}
 
Example #14
Source File: CarrotClusteringEngineTest.java    From lucene-solr with Apache License 2.0
@Test
public void testLanguageCodeMapping() throws Exception {
  final ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
  params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");
  
  final List<String> labels = getLabels(checkEngine(
      getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
          "one_supported_language_of_many")), params).get(0));
  assertEquals(3, labels.size());
  assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
}
 
Example #15
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0
/**
 * Clusters the given list of PagePOJOs.
 * 
 * @author GS
 * @param list
 *            the PagePOJO list to cluster
 * @return a map from cluster label to the titles of the documents in that cluster
 * @throws IOException
 * @throws Exception
 */
public Map<String,List<String>> cluster(List<PagePOJO> list) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);
	final List<Document> documents = Lists.newArrayList();
	Iterator<PagePOJO> it = list.iterator();
	while (it.hasNext()) {
		PagePOJO pojo = it.next();
		documents.add(new Document(pojo.getTitle(), pojo.getContent(), LanguageCode.CHINESE_SIMPLIFIED));
	}
	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	final ProcessingResult englishResult = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(englishResult); // display the clusters on the console
	// "result" is declared locally here so the snippet is self-contained; in the
	// original class it may be a field of the enclosing type.
	final Map<String, List<String>> result = new HashMap<String, List<String>>();
	for (org.carrot2.core.Cluster c : englishResult.getClusters()) {
		LinkedList<String> value = new LinkedList<String>();
		for (Document d : c.getAllDocuments()) {
			value.add(d.getField(Document.TITLE).toString());
		}
		result.put(c.getLabel(), value);
	}
	return result;
}
 
Example #16
Source File: CommitsMessageTopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0
private List<Cluster> produceCommitsMessagesTopics(CommitsMessageTopicsTransMetric db) {
	final ArrayList<Document> documents = new ArrayList<Document>();
	for (CommitMessage commitMessage : db.getCommitsMessages())
		documents.add(new Document(commitMessage.getSubject(), commitMessage.getMessage(), "", LanguageCode.ENGLISH, produceUID(commitMessage)));
	return produceTopics(documents);
}
 
Example #17
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0
private List<Cluster> produceBugTrackerTopics(TopicsTransMetric db) {
	final ArrayList<Document> documents = new ArrayList<Document>();
	for (BugTrackerCommentsData comment : db.getBugTrackerComments())
		documents.add(new Document(comment.getSubject(), comment.getText(), "", LanguageCode.ENGLISH, produceUID(comment)));
	return produceTopics(documents);
}
 
Example #18
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0
private List<Cluster> produceNewsgroupTopics(TopicsTransMetric db) {
	final ArrayList<Document> documents = new ArrayList<Document>();
	for (NewsgroupArticlesData article : db.getNewsgroupArticles())
		documents.add(new Document(article.getSubject(), article.getText(), "", LanguageCode.ENGLISH, produceUID(article)));
	return produceTopics(documents);
}
 
Example #19
Source File: ClusteringNonEnglishContent.java    From scava with Eclipse Public License 2.0
@SuppressWarnings("unchecked")
public static void main(String [] args)
{
    // [[[start:clustering-non-english-content]]]
    /*
     * We use a Controller that reuses instances of Carrot2 processing components
     * and caches results produced by document sources.
     */
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    /*
     * In the first call, we'll cluster a document list, setting the language for each
     * document separately.
     */
    final List<Document> documents = Lists.newArrayList();
    for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING)
    {
        documents.add(new Document(document.getTitle(), document.getSummary(),
            document.getContentUrl(), LanguageCode.ENGLISH));
    }

    final Map<String, Object> attributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .documents(documents);
    final ProcessingResult englishResult = controller.process(
        attributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(englishResult);

    /*
     * In the second call, we will fetch results for a Chinese query from Bing,
     * explicitly setting the Bing-specific language attribute. Based on that
     * attribute, the document source will set the appropriate language for each
     * document.
     */
    attributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);

    Bing3WebDocumentSourceDescriptor.attributeBuilder(attributes)
        .market(MarketOption.CHINESE_CHINA);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(attributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here!

    final ProcessingResult chineseResult = controller.process(attributes,
        Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult);

    /*
     * In the third call, we will fetch results for the same Chinese query from
     * Google. As the Google document source does not have a specific attribute for
     * setting the language, it will not set the documents' language for us. To make
     * sure the right lexical resources are used, we will need to set the
     * MultilingualClustering.defaultLanguage attribute to Chinese on our own.
     */
    attributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("聚类" /* clustering? */)
        .results(100);

    MultilingualClusteringDescriptor.attributeBuilder(attributes)
        .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

    final ProcessingResult chineseResult2 = controller.process(attributes,
        GoogleDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult2);
    // [[[end:clustering-non-english-content]]]
}