org.carrot2.clustering.lingo.LingoClusteringAlgorithm Java Examples

The following examples show how to use org.carrot2.clustering.lingo.LingoClusteringAlgorithm. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ClusteringActionIT.java    From elasticsearch-carrot2 with Apache License 2.0 6 votes vote down vote up
/**
 * Round-trips the Lingo algorithm's attributes through Attrs.extract/populate
 * and verifies that a clustering request built from those attributes yields a
 * valid, JSON-serializable response.
 */
public void testAttributes() throws IOException {
    final LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();
    algorithm.desiredClusterCount.set(5);

    // populate() must accept exactly what extract() produced (lossless round-trip).
    final Map<String, Object> attributeMap = Attrs.extract(algorithm);
    Attrs.populate(algorithm, attributeMap);

    final ClusteringActionResponse result =
        new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .addSourceFieldMapping("content", LogicalField.CONTENT)
            .addAttributes(Attrs.extract(algorithm))
            .setSearchRequest(
                client.prepareSearch()
                      .setIndices(INDEX_TEST)
                      .setSize(100)
                      .setQuery(QueryBuilders.matchAllQuery())
                      .setFetchSource(new String[] {"title", "content"}, null))
            .execute()
            .actionGet();

    checkValid(result);
    checkJsonSerialization(result);

    // presumably the +1 slack allows a synthetic "Other Topics" group — TODO confirm
    Assertions.assertThat(result.getDocumentGroups().length).isBetween(0, 5 + 1);
}
 
Example #2
Source File: UsingCustomLanguageModel.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Demonstrates installing custom language-model factories (stemmer, tokenizer,
 * lexical data) as controller initialization-time attributes, then clustering
 * with both Lingo and STC.
 */
public static void main(String [] args)
{
    @SuppressWarnings("unchecked")
    final Controller controller = ControllerFactory
        .createCachingPooling(IDocumentSource.class);

    // Register the custom factory classes once, at init time. Init-time
    // attributes are preferred over processing-time ones here because the
    // instances created during initialization are pooled and reused for all
    // subsequent requests.
    final Map<String, Object> initAttributes = Maps.newHashMap();
    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
        .stemmerFactory(CustomStemmerFactory.class)
        .tokenizerFactory(CustomTokenizerFactory.class)
        .lexicalDataFactory(CustomLexicalDataFactory.class);
    controller.init(initAttributes);

    // Run both algorithms. Cluster quality degrades when the stop word list
    // is empty (especially for STC).
    clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
    clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
}
 
Example #3
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0 6 votes vote down vote up
/**
 * 对所有的PagePOJO进行聚类 — clusters every crawled page read from the given
 * JSON dump using the Lingo algorithm and prints the clusters to the console.
 * 
 * @author GS
 * @param docPath path of the JSON file holding the crawled {@code Hit} records
 * @return the Carrot2 {@code ProcessingResult} containing the computed clusters
 * @throws IOException if the document file cannot be read
 * @throws Exception if clustering fails
 */
public ProcessingResult cluster(String docPath) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);

	// Convert every crawled page into a Carrot2 Document (title + content).
	final List<Document> documents = Lists.newArrayList();
	JsonReader jr = new JsonReader(new File(docPath));
	try {
		while (jr.hasNext()) {
			Hit h = jr.next();
			documents.add(new Document(h.getPagePOJO().getTitle(), h
					.getPagePOJO().getContent()));
		}
	} finally {
		// BUGFIX: close in finally so the reader is released even when
		// hasNext()/next() throws mid-stream (the original leaked it).
		jr.close();
	}

	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	final ProcessingResult clusteringResult = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(clusteringResult);// 展示 (display)
	return clusteringResult;
}
 
Example #4
Source File: UsingCustomLexicalResources.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Shows how to load lexical resources from a custom filesystem location by
 * passing an IResourceLocator-backed ResourceLookup at initialization time.
 */
public static void main(String [] args)
{
    @SuppressWarnings("unchecked")
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    // Resources are looked up in the "resources" folder under the current
    // working directory; DirLocator is one of several IResourceLocator
    // implementations available.
    final File resourcesDir = new File("resources");
    final ResourceLookup resourceLookup = new ResourceLookup(new DirLocator(resourcesDir));

    final Map<String, Object> initAttributes = Maps.newHashMap();

    // Merging all lexical resources is the default setting and usually helps
    // with multi-lingual content; it is set explicitly here for clarity.
    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
        .mergeResources(true);
    LexicalDataLoaderDescriptor.attributeBuilder(initAttributes)
        .resourceLookup(resourceLookup);

    controller.init(initAttributes);

    // Cluster some data with Lingo and STC.
    clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
    clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
}
 
Example #5
Source File: SavingResultsToXml.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Clusters a bundled sample document set with Lingo and serializes the
 * ProcessingResult to XML on standard output.
 */
public static void main(String [] args) throws Exception
{
    // Cluster a static sample collection ("data mining" documents).
    final Controller controller = ControllerFactory.createSimple();
    final Map<String, Object> attributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult processingResult = controller.process(attributes,
        LingoClusteringAlgorithm.class);

    // Serialize the entire result (documents and clusters) to XML.
    processingResult.serialize(System.out);
    System.out.println();

    // Alternatively, choose which parts get serialized: here documents are
    // skipped and clusters are kept.
    processingResult.serialize(System.out,
        false /* don't save documents */,
        true /* save clusters */);
}
 
Example #6
Source File: SavingResultsToJson.java    From scava with Eclipse Public License 2.0 6 votes vote down vote up
/**
 * Clusters a bundled sample document set with Lingo and serializes the
 * result as JSON (and as a JSON-P callback) to standard output.
 */
public static void main(String [] args) throws Exception
{
    // Cluster a static sample collection of "data mining" documents.
    final Controller controller = ControllerFactory.createSimple();
    final Map<String, Object> attributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .documents(new ArrayList<Document>(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");

    final ProcessingResult result = controller.process(attributes,
        LingoClusteringAlgorithm.class);

    // Serialize the entire result to JSON (the original comment said "XML").
    // NOTE(review): the PrintWriter is never flushed or closed here — output
    // may be lost unless serializeJson flushes internally; confirm.
    result.serializeJson(new PrintWriter(System.out));
    System.out.println();

    // Optionally, we can provide a callback for JSON-P-style calls
    result.serializeJson(
        new PrintWriter(System.out), "loadResults",
        true /* indent */, 
        false /* save documents */, 
        true /* save clusters */);

}
 
Example #7
Source File: ClusteringActionIT.java    From elasticsearch-carrot2 with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that the algorithm-listing endpoint reports all three built-in
 * Carrot2 clustering algorithms.
 */
public void testListAlgorithms() {
    final ListAlgorithmsActionResponse response =
            new ListAlgorithmsActionRequestBuilder(client).get();

    final List<String> algorithmNames = response.getAlgorithms();

    // The exact set may be larger; only the three known algorithms must appear.
    Assertions.assertThat(algorithmNames)
        .isNotEmpty()
        .contains(
            LingoClusteringAlgorithm.NAME,
            STCClusteringAlgorithm.NAME,
            BisectingKMeansClusteringAlgorithm.NAME);
}
 
Example #8
Source File: Cluster.java    From DistributedCrawler with Apache License 2.0 5 votes vote down vote up
/**
 * 对指定的PagePOJO进行聚类 — clusters the given pages (Simplified Chinese)
 * with the Lingo algorithm and groups the document titles by cluster label.
 * 
 * @author GS
 * @param list
 *            PagePOJO List
 * @return map from cluster label to the titles of the documents in that
 *         cluster (the original javadoc incorrectly said ProcessingResult)
 * @throws IOException
 * @throws Exception
 */
public Map<String,List<String>> cluster(List<PagePOJO> list) throws IOException,
		Exception {
	@SuppressWarnings("unchecked")
	final Controller controller = ControllerFactory
			.createCachingPooling(IDocumentSource.class);

	// Wrap each page as a Carrot2 document tagged as Simplified Chinese.
	final List<Document> documents = Lists.newArrayList();
	for (PagePOJO pojo : list) {
		documents.add(new Document(pojo.getTitle(), pojo.getContent(),
				LanguageCode.CHINESE_SIMPLIFIED));
	}

	final Map<String, Object> attributes = Maps.newHashMap();
	CommonAttributesDescriptor.attributeBuilder(attributes).documents(
			documents);
	// Renamed from the misleading "englishResult": the content is Chinese.
	final ProcessingResult clusteringResult = controller.process(attributes,
			LingoClusteringAlgorithm.class);
	ConsoleFormatter.displayResults(clusteringResult);// 展示 (display)

	// NOTE(review): "result" is an instance field that is never cleared here,
	// so repeated calls accumulate entries across invocations — confirm this
	// is intended.
	for (org.carrot2.core.Cluster c : clusteringResult.getClusters()) {
		List<String> titles = new LinkedList<String>();
		for (Document d : c.getAllDocuments()) {
			titles.add(d.getField(Document.TITLE).toString());
		}
		result.put(c.getLabel(), titles);
	}
	return result;
}
 
Example #9
Source File: CarrotClusteringEngineTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Engines declared with explicit names must be registered in declaration
 * order, and the "default" engine must resolve to the Lingo algorithm.
 */
@Test
public void testDeclarationEngineOrder() throws Exception {
  final ClusteringComponent comp =
      (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order");
  final Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);

  // Registration order must match the order in the config declaration.
  assertEquals(
      Arrays.asList("unavailable", "lingo", "stc", "mock", "default"),
      new ArrayList<>(engines.keySet()));

  final CarrotClusteringEngine defaultEngine =
      (CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME);
  assertEquals(LingoClusteringAlgorithm.class, defaultEngine.getClusteringAlgorithmClass());
}
 
Example #10
Source File: CarrotClusteringEngineTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * With the default-named configuration, engine registration order must be
 * preserved and the "default" engine must resolve to the Lingo algorithm.
 */
@Test
public void testDefaultEngineOrder() throws Exception {
  final ClusteringComponent comp =
      (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default");
  final Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);

  // Order of registration must survive into the engine map.
  assertEquals(
      Arrays.asList("stc", "default", "mock"),
      new ArrayList<>(engines.keySet()));

  final CarrotClusteringEngine defaultEngine =
      (CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME);
  assertEquals(LingoClusteringAlgorithm.class, defaultEngine.getClusteringAlgorithmClass());
}
 
Example #11
Source File: CommitsMessageTopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Groups the given commit-message documents into topic clusters with the
 * Lingo algorithm.
 */
private List<Cluster> produceTopics(ArrayList<Document> documents) {
	// A simple (non-caching) controller is sufficient for one-off processing.
	final Controller controller = ControllerFactory.createSimple();

	// Lingo can exploit the original query to improve labels; none is
	// available here, so null is passed.
	return controller.process(documents, null, LingoClusteringAlgorithm.class)
			.getClusters();
}
 
Example #12
Source File: TopicsTransMetricProvider.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Clusters the given documents by topic using the Lingo algorithm.
 */
private List<Cluster> produceTopics(ArrayList<Document> documents) {
	/* A controller to manage the processing pipeline. */
	final Controller controller = ControllerFactory.createSimple();

	// No query context is available for these documents, hence the null
	// query argument to process().
	final ProcessingResult processingResult = controller.process(documents,
			null, LingoClusteringAlgorithm.class);
	return processingResult.getClusters();
}
 
Example #13
Source File: ClusteringDataFromPubMed.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Fetches 100 search results for the query "heart" from PubMed and clusters
 * them with a customized Lingo configuration.
 */
public static void main(String [] args)
{
    // SimpleController performs no caching; see the CachingController example
    // if documents, clusters or component instances need to be cached.
    final Controller controller = ControllerFactory.createSimple();
    final Map<String, Object> attributes = new HashMap<String, Object>();

    // Search attributes: the query and the number of results to fetch.
    CommonAttributesDescriptor
        .attributeBuilder(attributes)
        .query("heart")
        .results(100);

    // Optional Lingo tuning; see
    // http://download.carrot2.org/head/manual/#section.component.lingo for the
    // full attribute list.
    final AttributeBuilder lingoBuilder =
        LingoClusteringAlgorithmDescriptor.attributeBuilder(attributes);
    lingoBuilder.matrixReducer()
        .factorizationFactory(LocalNonnegativeMatrixFactorizationFactory.class);
    lingoBuilder.matrixBuilder().titleWordsBoost(7);

    final ProcessingResult result = controller.process(attributes,
        PubMedDocumentSource.class, LingoClusteringAlgorithm.class);

    ConsoleFormatter.displayResults(result);
}
 
Example #14
Source File: LoadingAttributeValuesFromXml.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Loads attribute value sets from an XML resource and uses two of them — a
 * "faster-clustering" set (applied at init time) and the file's default set
 * (applied per request) — to run Lingo clustering.
 *
 * @throws Exception if the XML resource cannot be read or processing fails
 */
public static void main(String [] args) throws Exception
{
    // Load attribute value sets from the XML stream. try-with-resources
    // replaces the original finally { CloseableUtils.close(xmlStream); } and
    // guarantees the stream is closed even if deserialization fails.
    final AttributeValueSets attributeValueSets;
    try (InputStream xmlStream = LoadingAttributeValuesFromXml.class
        .getResourceAsStream("algorithm-lingo-attributes.xml"))
    {
        attributeValueSets = AttributeValueSets.deserialize(xmlStream);
    }

    // Get the desired sets of attribute values for use with further processing.
    final Map<String, Object> defaultAttributes = attributeValueSets
        .getDefaultAttributeValueSet().getAttributeValues();

    final Map<String, Object> fasterClusteringAttributes = attributeValueSets
        .getAttributeValueSet("faster-clustering").getAttributeValues();

    // Perform processing using the attribute values.
    final Controller controller = ControllerFactory.createSimple();

    // Initialize the controller with one attribute set.
    controller.init(fasterClusteringAttributes);

    // Perform clustering using the attribute set provided at initialization time.
    Map<String, Object> requestAttributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(requestAttributes)
        .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
        .query("data mining");
    ProcessingResult results = controller.process(requestAttributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayClusters(results.getClusters());

    // Perform clustering using some other attribute set, in this case the
    // one that is the default in the XML file.
    requestAttributes =
        CommonAttributesDescriptor.attributeBuilder(Maps.newHashMap(defaultAttributes))
            .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
            .query("data mining").map;

    results = controller.process(requestAttributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayClusters(results.getClusters());
}
 
Example #15
Source File: ClusteringDocumentList.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
public static void main(String [] args)
{
    /* [[[start:clustering-document-list-intro]]]
     * 
     * <div>
     * <p>
     * The easiest way to get started with Carrot2 is to cluster a collection
     * of {@link org.carrot2.core.Document}s. Each document can consist of:
     * </p>
     * 
     * <ul>
     * <li>document content: a query-in-context snippet, document abstract or full text,</li>
     * <li>document title: optional, some clustering algorithms give more weight to document titles,</li>
     * <li>document URL: optional, used by the {@link org.carrot2.clustering.synthetic.ByUrlClusteringAlgorithm}, 
     * ignored by other algorithms.</li>
     * </ul>
     * 
     * <p>
     * To make the example short, the code shown below clusters only 5 documents. Use
     * at least 20 to get reasonable clusters. If you have access to the query that generated
     * the documents being clustered, you should also provide it to Carrot2 to get better clusters.
     * </p>
     * </div>
     * 
     * [[[end:clustering-document-list-intro]]]
     */
    {
        // [[[start:clustering-document-list]]]
        /* Example input as {url, title, snippet} rows. Normally at least 20
         * documents are needed for reasonable clusters. */
        final String [][] rows = new String [] []
        {
            {
                "http://en.wikipedia.org/wiki/Data_mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."
            },

            {
                "http://www.ccsu.edu/datamining/resources.html",
                "CCSU - Data Mining",
                "A collection of Data Mining links edited by the Central Connecticut State University ... Graduate Certificate Program. Data Mining Resources. Resources. Groups ..."
            },

            {
                "http://www.kdnuggets.com/",
                "KDnuggets: Data Mining, Web Mining, and Knowledge Discovery",
                "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."
            },

            {
                "http://en.wikipedia.org/wiki/Data-mining",
                "Data mining - Wikipedia, the free encyclopedia",
                "Data mining is considered a subfield within the Computer Science field of knowledge discovery. ... claim to perform \"data mining\" by automating the creation ..."
            },

            {
                "http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
                "Data Mining: What is Data Mining?",
                "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."
            },
        };

        /* Prepare Carrot2 documents: Document(title, summary, contentUrl). */
        final ArrayList<Document> documents = new ArrayList<Document>(rows.length);
        for (int i = 0; i < rows.length; i++)
        {
            documents.add(new Document(rows[i][1], rows[i][2], rows[i][0]));
        }

        /* A controller to manage the processing pipeline. */
        final Controller controller = ControllerFactory.createSimple();

        /*
         * Cluster by topic with Lingo. Lingo can take advantage of the query
         * that produced the documents, so it is passed along with them.
         */
        final ProcessingResult byTopicClusters = controller.process(documents, "data mining",
            LingoClusteringAlgorithm.class);
        final List<Cluster> clustersByTopic = byTopicClusters.getClusters();

        /* Cluster by domain; the query is not useful here, hence null. */
        final ProcessingResult byDomainClusters = controller.process(documents, null,
            ByUrlClusteringAlgorithm.class);
        final List<Cluster> clustersByDomain = byDomainClusters.getClusters();
        // [[[end:clustering-document-list]]]

        ConsoleFormatter.displayClusters(clustersByTopic);
        ConsoleFormatter.displayClusters(clustersByDomain);
    }
}
 
Example #16
Source File: UsingCachingController.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Demonstrates the caching controller: the same Bing query is executed twice
 * and timed, showing that the second run is served from the cache.
 */
@SuppressWarnings(
{
    "unused", "unchecked"
})
public static void main(String [] args)
{
    // [[[start:using-caching-controller]]]
    /*
     * Create the caching controller. One instance per application life cycle
     * is enough; it caches results fetched from any document source as well
     * as clusters generated by the Lingo algorithm.
     */
    final Controller controller = ControllerFactory.createCachingPooling(
        IDocumentSource.class, LingoClusteringAlgorithm.class);

    /*
     * The controller must be initialized before use. Initialization can set
     * defaults for attributes: here the results count (50) and the Bing API key.
     */
    final Map<String, Object> globalAttributes = new HashMap<String, Object>();
    CommonAttributesDescriptor
        .attributeBuilder(globalAttributes)
            .results(50);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(globalAttributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here
    controller.init(globalAttributes);

    /*
     * The controller is now ready. To show that documents from the source are
     * cached, the same query is processed twice and each run is timed.
     */
    ProcessingResult result;
    long start, duration;

    final Map<String, Object> attributes = new HashMap<String, Object>();
    CommonAttributesDescriptor.attributeBuilder(attributes).query("data mining");

    start = System.currentTimeMillis();
    result = controller.process(attributes, Bing3WebDocumentSource.class,
        LingoClusteringAlgorithm.class);
    duration = System.currentTimeMillis() - start;
    System.out.println(duration + " ms (empty cache)");

    start = System.currentTimeMillis();
    result = controller.process(attributes, Bing3WebDocumentSource.class,
        LingoClusteringAlgorithm.class);
    duration = System.currentTimeMillis() - start;
    System.out.println(duration + " ms (documents and clusters from cache)");
    // [[[end:using-caching-controller]]]
}
 
Example #17
Source File: ClusteringNonEnglishContent.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Demonstrates three ways of telling Carrot2 which language to cluster in:
 * per-document tags, a source-specific market attribute (Bing), and the
 * MultilingualClustering default-language attribute (Google).
 */
@SuppressWarnings("unchecked")
public static void main(String [] args)
{
    // [[[start:clustering-non-english-content]]]
    /*
     * A controller that pools Carrot2 component instances and caches results
     * produced by document sources.
     */
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    /*
     * First call: cluster a document list, setting the language on each
     * document individually.
     */
    final List<Document> taggedDocuments = Lists.newArrayList();
    for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING)
    {
        taggedDocuments.add(new Document(document.getTitle(), document.getSummary(),
            document.getContentUrl(), LanguageCode.ENGLISH));
    }

    final Map<String, Object> requestAttributes = Maps.newHashMap();
    CommonAttributesDescriptor.attributeBuilder(requestAttributes)
        .documents(taggedDocuments);
    final ProcessingResult englishResult = controller.process(
        requestAttributes, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(englishResult);

    /*
     * Second call: fetch results for a Chinese query from Bing, setting Bing's
     * market attribute explicitly. Based on that attribute, the document
     * source sets the appropriate language for each document.
     */
    requestAttributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(requestAttributes)
        .query("聚类" /* clustering? */)
        .results(100);

    Bing3WebDocumentSourceDescriptor.attributeBuilder(requestAttributes)
        .market(MarketOption.CHINESE_CHINA);
    Bing3WebDocumentSourceDescriptor
        .attributeBuilder(requestAttributes)
            .appid(BingKeyAccess.getKey()); // use your own ID here!

    final ProcessingResult chineseResult = controller.process(requestAttributes,
        Bing3WebDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult);

    /*
     * Third call: the same Chinese query via Google. Google's document source
     * has no language-specific attribute and will not tag documents, so the
     * right lexical resources are selected by setting
     * MultilingualClustering.defaultLanguage to Chinese explicitly.
     */
    requestAttributes.clear();
    
    CommonAttributesDescriptor.attributeBuilder(requestAttributes)
        .query("聚类" /* clustering? */)
        .results(100);

    MultilingualClusteringDescriptor.attributeBuilder(requestAttributes)
        .defaultLanguage(LanguageCode.CHINESE_SIMPLIFIED);

    final ProcessingResult chineseResult2 = controller.process(requestAttributes,
        GoogleDocumentSource.class, LingoClusteringAlgorithm.class);
    ConsoleFormatter.displayResults(chineseResult2);
    // [[[end:clustering-non-english-content]]]
}
 
Example #18
Source File: MoreConfigurationsOfOneAlgorithmInCachingController.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Registers two named configurations of the Lingo algorithm ("lingo-fast" and
 * "lingo-accurate") on one caching controller and runs the same query through
 * both.
 */
@SuppressWarnings(
{
    "unchecked"
})
public static void main(String [] args)
{
    /*
     * Create a controller that caches all documents.
     */
    final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

    /*
     * Global attribute defaults. These apply to all configurations defined
     * below unless a specific configuration overrides them.
     */
    final Map<String, Object> globalAttributes = new HashMap<String, Object>();

    CompletePreprocessingPipelineDescriptor.attributeBuilder(globalAttributes)
            .documentAssigner()
                .exactPhraseAssignment(false);

    /*
     * Two configurations of the Lingo algorithm: one optimized for clustering
     * speed, the other for cluster quality.
     */
    final Map<String, Object> fastAttributes = Maps.newHashMap();
    LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes)
        .desiredClusterCountBase(20)
        .matrixReducer()
            .factorizationQuality(FactorizationQuality.LOW);

    // A higher document-frequency threshold prunes rare words, trading
    // quality for speed.
    CompletePreprocessingPipelineDescriptor.attributeBuilder(fastAttributes)
            .caseNormalizer()
                .dfThreshold(2);

    final Map<String, Object> accurateAttributes = Maps.newHashMap();
    LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes)
        .desiredClusterCountBase(40)
        .matrixReducer()
            .factorizationQuality(FactorizationQuality.HIGH);

    CompletePreprocessingPipelineDescriptor.attributeBuilder(accurateAttributes)
            .documentAssigner()
                .exactPhraseAssignment(true);

    // BUGFIX: this block previously targeted fastAttributes, silently
    // overwriting the dfThreshold(2) set above. The accurate configuration is
    // the one that should keep every word (threshold 1).
    CompletePreprocessingPipelineDescriptor.attributeBuilder(accurateAttributes)
            .caseNormalizer()
                .dfThreshold(1);

    /*
     * Initialize the controller with the global attributes and the two
     * configurations. A configuration consists of the component class (a
     * document source or a clustering algorithm), its string identifier and
     * its attributes.
     */
    controller.init(globalAttributes, 
        new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class, 
            "lingo-fast", fastAttributes),
        new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class, 
            "lingo-accurate", accurateAttributes)
    );
    
    /*
     * Call the two clustering configurations. Because string identifiers are
     * used instead of classes, the document source is referenced by class
     * name rather than by the class itself.
     */
    final Map<String, Object> attributes = new HashMap<String, Object>();
    CommonAttributesDescriptor.attributeBuilder(attributes)
        .query("data mining");

    final ProcessingResult fastResult = controller.process(attributes,
        Bing3WebDocumentSource.class.getName(), "lingo-fast");
    ConsoleFormatter.displayClusters(fastResult.getClusters());
    
    final ProcessingResult accurateResult = controller.process(attributes,
        Bing3WebDocumentSource.class.getName(), "lingo-accurate");
    ConsoleFormatter.displayClusters(accurateResult.getClusters());
}
 
Example #19
Source File: ClusteringQualityBenchmark.java    From scava with Eclipse Public License 2.0 4 votes vote down vote up
/**
 * Runs Lingo and STC over every Ambient benchmark topic and prints clustering
 * quality metrics (contamination, F-score, precision, recall, NMI) as a table.
 */
public static void main(String [] args)
{
    // Enumerate all benchmark topics and create a one-off controller.
    // NOTE(review): the original comment here said "Disable excessive logging",
    // but no logging configuration happens in this method — confirm whether
    // that code was lost.
    final AmbientTopic [] topics = AmbientDocumentSource.AmbientTopic.values();
    final Controller controller = ControllerFactory.createSimple();

    // List of algorithms to test
    final ArrayList<Class<? extends IProcessingComponent>> algorithms = Lists
        .newArrayList();
    algorithms.add(LingoClusteringAlgorithm.class);
    algorithms.add(STCClusteringAlgorithm.class);

    // Table setup: doubles default to 3 decimal places; only the first two
    // columns are declared explicitly.
    // NOTE(review): the PrintWriter wrapping System.out is never flushed in
    // this method — verify TabularOutput flushes on nextRow(), otherwise the
    // tail of the output may be lost.
    TabularOutput t = new TabularOutput(new PrintWriter(System.out));
    t.columnSeparator(" | ");
    t.defaultFormat(Double.class).format("%.3f");
    t.addColumn("Topic").alignLeft().format("%-18s");
    t.addColumn("Algorithm").alignLeft().format("%-15s");

    // Benchmark every (topic, algorithm) pair.
    for (AmbientTopic topic : topics)
    {
        for (Class<? extends IProcessingComponent> algorithm : algorithms)
        {
            final Map<String, Object> attributes = Maps.newHashMap();
            AmbientDocumentSourceDescriptor.attributeBuilder(attributes).topic(topic);

            // The metrics calculator is appended to the pipeline so the
            // quality metrics appear in the result's attribute map.
            ProcessingResult result = controller.process(
                attributes, AmbientDocumentSource.class, algorithm, ClusteringMetricsCalculator.class);

            t.rowData("Topic", topic.name());
            t.rowData("Algorithm", algorithm.getSimpleName());

            Map<String, Object> attrs = result.getAttributes();

            // Metric columns below are referenced by name only — presumably
            // TabularOutput creates columns on first use; TODO confirm.
            t.rowData(
                "Contamination", 
                attrs.get(ContaminationMetricDescriptor.Keys.WEIGHTED_AVERAGE_CONTAMINATION));

            t.rowData(
                "F-Score", 
                attrs.get(PrecisionRecallMetricDescriptor.Keys.WEIGHTED_AVERAGE_F_MEASURE));

            t.rowData(
                "Precision", 
                attrs.get(PrecisionRecallMetricDescriptor.Keys.WEIGHTED_AVERAGE_PRECISION));

            t.rowData(
                "Recall", 
                attrs.get(PrecisionRecallMetricDescriptor.Keys.WEIGHTED_AVERAGE_RECALL));

            t.rowData(
                "NMI", 
                attrs.get(NormalizedMutualInformationMetricDescriptor.Keys.NORMALIZED_MUTUAL_INFORMATION));

            t.nextRow();
        }
    }

}