# -*- coding: utf-8 -*- """ Functions for fetching categorization datasets """ from sklearn.datasets.base import Bunch from .utils import _get_cluster_assignments def fetch_AP(): """ Fetch Almuhareb and Abdulrahman categorization dataset Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'clusters': dict of arrays of words representing References ---------- Almuhareb et al., "Concept learning and categorization from the web", 2005 Notes ----- Authors description: Our goal was to create a dataset balanced with respect to three factors: class type, frequency, and ambiguity. First of all, we aimed to include one class of nouns for each of the 21 unique beginners of the WordNet noun hierarchy4 . We chose subclasses for each of these 21 beginners that would represent a reasonably natural cluster: e.g., the hyponym social occasion for the unique beginner event. From each such class, we selected between 13 and 21 nouns to be representative concepts for the class (e.g., ceremony, feast, and graduation for the class social occasion). Secondly, we aimed to include about 1/3 high frequency nouns, 1/3 medium frequency, and 1/3 low frequency. Noun frequencies where estimated using the British National Corpus. We considered as highly frequent those nouns with frequency 1,000 or more; as medium frequent the nouns with between 1,000 and 100 occurrences; and those between 100 and 5 as low frequent. Thirdly, we wanted the dataset to be balanced as to ambiguity, estimated on the basis of the number of senses in WordNet. Nouns with 4 or more senses were considered highly ambiguous; nouns with 2 or 3 senses medium ambiguous; and nouns with a single sense as not ambiguous. """ return _get_cluster_assignments(dataset_name="EN-AP", url="https://www.dropbox.com/sh/6xu1c1aan8f83p3/AACMyoLwncNhRkUkqvGurYB6a?dl=1") def fetch_BLESS(): """ Fetch Baroni and Marco categorization dataset Parameters ------- Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': words 'y': cluster assignment References ---------- Baroni et al. "How we BLESSed distributional semantic evaluation", 2011 Notes ----- Data set includes 200 concrete nouns (100 animate and 100 inanimate nouns) from different classes (e.g., tools, clothing, vehicles, animals, etc.). """ return _get_cluster_assignments(dataset_name="EN-BLESS", url="https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1") def fetch_battig(): """ Fetch 1969 Battig dataset Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': words 'y': cluster assignment 'freq': frequency of response 'frequency': Kucera-Francis word frequency 'rank': rank of frequence within response 'rfreq': rated frequency References ---------- W.F Battig & W.E Montague (1968). Category norms for verbal items in 56 categories: A replication and extension of the Connecticut norms using University of Maryland and Illinois students (Tech. Rep.) University of Colorado, Boulder, CO (1968) Notes ----- This dataset comprises a ranked list of 5231 words listed in 56 taxonomic categories by people who were asked to list as many exemplars of a given category ("a precious stone", "a unit of time", "a fruit", "a color", etc.). Participants had 30s to generate as many responses to each category as possible, after which time the next category name was presented. Included in this dataset are all words from the Battig and Montague (1969) norms listed with freq > 1. This is not the same dataset as 'battig' in Baroni et al. "Don’t count, predict! A systematic comparison of context-counting vs. context-predicting semantic vectors" """ data = _get_cluster_assignments(dataset_name="EN-BATTIG", url="https://www.dropbox.com/sh/ckp4yu7k7xl7u2a/AABhmpgU3ake3T9liA9BR8EBa?dl=1", sep=",", skip_header=True) return Bunch(X=data.X[:, 0], y=data.y, freq=data.X[:, 1], frequency=data.X[:, 2], rank=data.X[:, 3], rfreq=data.X[:, 4]) def fetch_ESSLI_2c(): """ Fetch ESSLI 2c task categorization dataset Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': words 'y': cluster assignment References ---------- Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:verb_categorization Notes ----- The goal of the sub-task is to group verbs into semantic categories. The data set consists of 45 verbs, belonging to 9 semantic classes. The classification scheme is inspired by P. Vinson & G. Vigliocco (2007), “Semantic Feature Production Norms for a Large Set of Objects and Events”, Behavior Research Methods, which in turn closely follows the classification proposed in Levin (1993). The data set consists of 44 concrete nouns, belonging to 6 semantic categories (four animates and two inanimates). The nouns are included in the feature norms described in McRae et al. (2005) """ return _get_cluster_assignments(dataset_name="EN-ESSLI-2c", url="https://www.dropbox.com/sh/d3mcyl3b5mawfhm/AAABygW1rguhI4L0XSw_I68ta?dl=1") def fetch_ESSLI_2b(): """ Fetch ESSLI 2c task categorization dataset Parameters ------- Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': words 'y': cluster assignment References ---------- Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:abstract_concrete_nouns_discrimination. Notes ----- The data set consists of 40 nouns extracted from the MRC Psycholinguistic Database, with ratings by human subjects on the concreteness scale. The nouns have been classified into three classes: HI, LO and ME being highly, low and medium abstract nouns. """ return _get_cluster_assignments(dataset_name="EN-ESSLI-2b", url="https://www.dropbox.com/sh/7gdv52gy9vb4mf2/AACExLgHdbvbBrRZBP6CcdDaa?dl=1") def fetch_ESSLI_1a(): """ Fetch ESSLI 1a task categorization dataset. Returns ------- data : sklearn.datasets.base.Bunch dictionary-like object. Keys of interest: 'X': words 'y': cluster assignment References ---------- Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization. Notes ----- The goal of the sub-task is to group concrete nouns into semantic categories. The data set consists of 44 concrete nouns, belonging to 6 semantic categories (four animates and two inanimates). The nouns are included in the feature norms described in McRae et al. (2005) """ return _get_cluster_assignments(dataset_name="EN-ESSLI-1a", url="https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1")