Python sklearn.datasets.base.Bunch() Examples

The following are 30 code examples of sklearn.datasets.base.Bunch(), drawn from open source projects; the originating project and source file are noted above each example. You may also want to check out all available functions/classes of the module sklearn.datasets.base.
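A Bunch is a lightweight container that exposes its keys both as dictionary items and as attributes. A minimal sketch of that dual access (the key names here are arbitrary; in recent scikit-learn releases the class lives at sklearn.utils.Bunch):

from sklearn.datasets.base import Bunch  # sklearn.utils.Bunch in newer releases

# A Bunch behaves like a dict whose keys are also attributes.
b = Bunch(data=[[1.0, 2.0], [3.0, 4.0]], target=[0, 1], DESCR="toy dataset")

print(b.data)            # attribute access
print(b['target'])       # item access to the same underlying value
print(sorted(b.keys()))  # ['DESCR', 'data', 'target']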
Example #1
Source File: base.py    From sklearn-theano with BSD 3-Clause "New" or "Revised" License
def load_sample_images():
    """Load sample images for image manipulation.
    Loads ``sloth``, ``sloth_closeup``, ``cat_and_dog``.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'filenames', the file
        names for the images, and 'DESCR'
        the full description of the dataset.
    """
    module_path = os.path.join(os.path.dirname(__file__), "images")
    with open(os.path.join(module_path, 'README.txt')) as f:
        descr = f.read()
    filenames = [os.path.join(module_path, filename)
                 for filename in os.listdir(module_path)
                 if filename.endswith(".jpg")]
    # Load image data for each image in the source folder.
    images = [np.array(Image.open(filename, 'r')) for filename in filenames]

    return Bunch(images=images,
                 filenames=filenames,
                 DESCR=descr) 
Example #2
Source File: poiRegression.py    From python-urbanPlanning with MIT License
def json2bunch(fName):   # Pass in the data; the reading code inside the function needs adjusting for different data storage formats
    infoDic=[]
    f=open(fName)
    jsonDecodes=json.load(f)
    j=0
    for info in jsonDecodes:
        condiKeys=info['detail_info'].keys()
        if 'price' in condiKeys and 'overall_rating' in condiKeys and 'service_rating' in condiKeys and 'facility_rating' in condiKeys and 'hygiene_rating' in condiKeys and 'image_num' in condiKeys and 'comment_num' in condiKeys and 'favorite_num' in condiKeys: # only keep a record when every required key has data, otherwise skip it
            if 50<float(info['detail_info']['price'])<1000: # restrict extraction to a price range
                j+=1
                infoDic.append([info['location']['lat'],info['location']['lng'],info['detail_info']['price'],info['detail_info']['overall_rating'],info['detail_info']['service_rating'],info['detail_info']['facility_rating'],info['detail_info']['hygiene_rating'],info['detail_info']['image_num'],info['detail_info']['comment_num'],info['detail_info']['favorite_num'],info['detail_info']['checkin_num'],info['name']])
            else:pass
        else:pass
    print('.....................................',j)

    data=np.array([(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7],v[8],v[9],v[10]) for v in infoDic],dtype='float')  # explanatory-variable (feature) part of the data
    targetInfo=np.array([v[11] for v in infoDic])  # target-variable (label) part
    dataBunch=base.Bunch(DESCR=r'info of poi',data=data,feature_names=['lat','lng','price','overall_rating','service_rating','facility_rating','hygiene_rating','image_num','comment_num','favorite_num','checkin_num'],target=targetInfo,target_names=['price','name'])  # build sklearn's Bunch data container
    return dataBunch # return the data as a Bunch
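The Bunch above can be handed straight to a scikit-learn estimator. A minimal sketch, assuming the price column (index 2 of data) is the quantity to predict and the remaining columns are features; the input file path is hypothetical:

import numpy as np
from sklearn.linear_model import RidgeCV

dataBunch = json2bunch('poi_hotels.json')    # hypothetical input file
y = dataBunch.data[:, 2]                     # price column, assumed to be the target
X = np.delete(dataBunch.data, 2, axis=1)     # remaining columns as features
model = RidgeCV().fit(X, y)
print(model.score(X, y))                     # in-sample R^2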
Example #3
Source File: poiStructure.py    From python-urbanPlanning with MIT License
def jsonDataFilter(fileInfo):   # Pass in the data; the reading code inside the function needs adjusting for different data storage formats
    rootPath=list(fileInfo.keys())  # root directory of the data files to be read
#    print(rootPath)
    dataName=flatten_lst(list(fileInfo.values()))  # list of file names of the data files to be read
#    print(dataName)
    coodiDic=[]
    for fName in dataName:  # read the JSON files one by one and store the needed fields in a list; this experiment uses POI latitude/longitude and first-level industry category names; note the Baidu coordinate system is used and has not been converted to WGS84
        f=open(os.path.join(rootPath[0],fName))
        jsonDecodes=json.load(f)
        coodiDic.append([(coordi['location']['lat'],coordi['location']['lng'],fName[:-5]) for coordi in jsonDecodes])
        coodiDic=flatten_lst(coodiDic) # the data read is nested several levels deep and needs flattening
#    print(coodiDic)
    data=np.array([(v[0],v[1]) for v in coodiDic])  # latitude/longitude information
    targetNames=np.array([v[2] for v in coodiDic])  # first-level category
#    print(data)
#    print(targetNames)
    class_label=LabelEncoder()  # encode the first-level category names as integers
    targetLabel=class_label.fit_transform(targetNames)
    class_mapping=[(idx,label) for idx,label in enumerate(class_label.classes_)]  # build the mapping between category names and integer codes
#    print(class_mapping)
    dataBunch=base.Bunch(DESCR=r'spatial points datasets of poi',data=data,feature_names=["XCoordinate","yCoordinate"],target=targetLabel,target_names=class_mapping)  # build sklearn's Bunch data container
    return dataBunch,class_mapping  # return the Bunch and the category-name mapping list
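A short sketch of consuming the returned Bunch, here fitting a nearest-neighbour classifier on the coordinates against the integer-encoded categories; the directory and file names passed in are hypothetical:

from sklearn.neighbors import KNeighborsClassifier

fileInfo = {r'./poi_json': ['delicacy.json', 'hotel.json']}  # hypothetical paths
dataBunch, class_mapping = jsonDataFilter(fileInfo)
clf = KNeighborsClassifier(n_neighbors=5).fit(dataBunch.data, dataBunch.target)
print(clf.score(dataBunch.data, dataBunch.target))           # training accuracy
print(class_mapping[:3])                                     # (integer code, category name) pairs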
Example #4
Source File: utils.py    From cpae with MIT License
def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False):
    data_dir = _get_dataset_dir("categorization", verbose=0)
    _fetch_file(url=url,
                 data_dir=data_dir,
                 uncompress=True,
                 move="{0}/{0}.txt".format(dataset_name),
                 verbose=0)
    files = glob.glob(os.path.join(data_dir, dataset_name + "/*.txt"))
    X = []
    y = []
    names = []
    for cluster_id, file_name in enumerate(files):
        with open(file_name) as f:
            lines = f.read().splitlines()[(int(skip_header)):]

            X += [l.split(sep) for l in lines]
            y += [os.path.basename(file_name).split(".")[0]] * len(lines)
    return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object")) 
Example #5
Source File: rasterPTSextraction_statistic_poi.py    From python-urbanPlanning with MIT License
def jsonDataFilter(fileInfo):   # Pass in the data; the reading code inside the function needs adjusting for different data storage formats
    rootPath=list(fileInfo.keys())  # root directory of the data files to be read
#    print(rootPath)
    dataName=flatten_lst(list(fileInfo.values()))  # list of file names of the data files to be read
#    print(dataName)
    coodiDic=[]
    for fName in dataName:  # read the JSON files one by one and store the needed fields in a list; this experiment uses POI latitude/longitude and first-level industry category names; note the Baidu coordinate system is used and has not been converted to WGS84
        f=open(os.path.join(rootPath[0],fName))
        jsonDecodes=json.load(f)
        coodiDic.append([(coordi['location']['lat'],coordi['location']['lng'],fName[:-5]) for coordi in jsonDecodes])
        coodiDic=flatten_lst(coodiDic) # the data read is nested several levels deep and needs flattening
#    print(coodiDic)
    data=np.array([(v[0],v[1]) for v in coodiDic])  # latitude/longitude information
    targetNames=np.array([v[2] for v in coodiDic])  # first-level category
#    print(data)
#    print(targetNames)
    class_label=LabelEncoder()  # encode the first-level category names as integers
    targetLabel=class_label.fit_transform(targetNames)
    class_mapping=[(idx,label) for idx,label in enumerate(class_label.classes_)]  # build the mapping between category names and integer codes
#    print(class_mapping)
    dataBunch=base.Bunch(DESCR=r'spatial points datasets of poi',data=data,feature_names=["XCoordinate","yCoordinate"],target=targetLabel,target_names=class_mapping)  # build sklearn's Bunch data container
    return dataBunch,class_mapping  # return the Bunch and the category-name mapping list
Example #6
Source File: utils.py    From word-embeddings-benchmarks with MIT License
def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False):
    data_dir = _get_dataset_dir("categorization", verbose=0)
    _fetch_file(url=url,
                 data_dir=data_dir,
                 uncompress=True,
                 move="{0}/{0}.txt".format(dataset_name),
                 verbose=0)
    files = glob.glob(os.path.join(data_dir, dataset_name + "/*.txt"))
    X = []
    y = []
    names = []
    for cluster_id, file_name in enumerate(files):
        with open(file_name) as f:
            lines = f.read().splitlines()[(int(skip_header)):]

            X += [l.split(sep) for l in lines]
            y += [os.path.basename(file_name).split(".")[0]] * len(lines)
    return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object")) 
Example #7
Source File: categorization.py    From word-embeddings-benchmarks with MIT License
def fetch_BLESS():
    """
    Fetch the BLESS categorization dataset of Baroni and Lenci

    Parameters
    -------

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Baroni et al. "How we BLESSed distributional semantic evaluation", 2011

    Notes
    -----
    Data set includes 200 concrete nouns (100 animate and 100 inanimate nouns)
    from different classes (e.g., tools, clothing, vehicles, animals, etc.).
    """
    return _get_cluster_assignments(dataset_name="EN-BLESS",
                                    url="https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1") 
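A quick look at what the returned Bunch holds once the download succeeds; Counter is only used here to tally cluster sizes:

from collections import Counter

bless = fetch_BLESS()
print(bless.X[:5])                      # first few words
print(Counter(bless.y).most_common(3))  # the largest clusters and their sizes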
Example #8
Source File: categorization.py    From cpae with MIT License
def fetch_BLESS():
    """
    Fetch the BLESS categorization dataset of Baroni and Lenci

    Parameters
    -------

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Baroni et al. "How we BLESSed distributional semantic evaluation", 2011

    Notes
    -----
    Data set includes 200 concrete nouns (100 animate and 100 inanimate nouns)
    from different classes (e.g., tools, clothing, vehicles, animals, etc.).
    """
    return _get_cluster_assignments(dataset_name="EN-BLESS",
                                    url="https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1") 
Example #9
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    bunch = Bunch(key='original')
    # This reproduces a problem when Bunch pickles have been created
    # with scikit-learn 0.16 and are read with 0.17. Basically there
    # is a surprising behaviour because reading bunch.key uses
    # bunch.__dict__ (which is non empty for 0.16 Bunch objects)
    # whereas assigning into bunch.key uses bunch.__setattr__. See
    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
    # more details
    bunch.__dict__['key'] = 'set from __dict__'
    bunch_from_pkl = loads(dumps(bunch))
    # After loading from pickle the __dict__ should have been ignored
    assert_equal(bunch_from_pkl.key, 'original')
    assert_equal(bunch_from_pkl['key'], 'original')
    # Making sure that changing the attr does change the value
    # associated with __getitem__ as well
    bunch_from_pkl.key = 'changed'
    assert_equal(bunch_from_pkl.key, 'changed')
    assert_equal(bunch_from_pkl['key'], 'changed') 
Example #10
Source File: categorization.py    From word-embeddings-benchmarks with MIT License
def fetch_ESSLI_1a():
    """
    Fetch ESSLI 1a task categorization dataset.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization.

    Notes
    -----
    The goal of the sub-task is to group concrete nouns into semantic categories.
    The data set consists of 44 concrete nouns, belonging to 6 semantic categories (four animates and two inanimates).
    The nouns are included in the feature norms described in McRae et al. (2005)
    """
    return _get_cluster_assignments(dataset_name="EN-ESSLI-1a",
                                    url="https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1") 
Example #11
Source File: base.py    From sklearn-theano with BSD 3-Clause "New" or "Revised" License
def load_images(filenames):
    """Load images for image manipulation.

    Parameters
    ----------
    filenames : iterable
         Iterable of filename paths as strings

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'filenames', the file
        names for the images
    """
    # Load image data for each image in the source folder.
    images = [np.array(Image.open(filename, 'r')) for filename in filenames]

    return Bunch(images=images,
                 filenames=filenames) 
Example #12
Source File: asirra.py    From sklearn-theano with BSD 3-Clause "New" or "Revised" License
def fetch_asirra(image_count=1000):
    """

    Parameters
    ----------
    image_count : positive integer

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    partial_path = check_fetch_asirra()
    m = Memory(cachedir=partial_path, compress=6, verbose=0)
    load_func = m.cache(_fetch_asirra)
    images, target = load_func(partial_path, image_count=image_count)
    return Bunch(data=images.reshape(len(images), -1),
                 images=images, target=target,
                 DESCR="Asirra cats and dogs dataset") 
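A usage sketch for the fetcher above; the Asirra download is sizable, and matplotlib is assumed only for display:

import matplotlib.pyplot as plt

asirra = fetch_asirra(image_count=10)
plt.imshow(asirra.images[0])
plt.title('cat' if asirra.target[0] == 0 else 'dog')  # 0 = cat, 1 = dog per the docstring
plt.show()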
Example #13
Source File: categorization.py    From cpae with MIT License
def fetch_ESSLI_1a():
    """
    Fetch ESSLI 1a task categorization dataset.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Originally published at http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization.

    Notes
    -----
    The goal of the sub-task is to group concrete nouns into semantic categories.
    The data set consists of 44 concrete nouns, belonging to 6 semantic categories (four animates and two inanimates).
    The nouns are included in the feature norms described in McRae et al. (2005)
    """
    return _get_cluster_assignments(dataset_name="EN-ESSLI-1a",
                                    url="https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1") 
Example #14
Source File: similarity.py    From cpae with MIT License
def fetch_SCWS():
    """
    Fetch SCWS dataset for testing similarity (with a context)

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with mean scores,
        'sd': standard deviation of scores

    References
    ----------
    Huang et al., "Improving Word Representations via Global Context and Multiple Word Prototypes", 2012

    Notes
    -----
    TODO
    """
    data = _get_as_pd('https://www.dropbox.com/s/qgqj366lzzzj1ua/preproc_SCWS.txt?dl=1', 'similarity', header=None, sep="\t")
    X = data.values[:, 0:2].astype("object")
    mean = data.values[:,2].astype(np.float)
    sd = np.std(data.values[:, 3:14].astype(np.float), axis=1).flatten()
    return Bunch(X=X, y=mean,sd=sd) 
Example #15
Source File: datasets.py    From nistats with BSD 3-Clause "New" or "Revised" License
def _make_path_events_file_spm_auditory_data(spm_auditory_data):
    """
    Accepts data for spm_auditory dataset as Bunch
    and constructs the filepath for its events descriptor file.
    Parameters
    ----------
    spm_auditory_data: Bunch

    Returns
    -------
    events_filepath: string
        Full path to the events.tsv file for spm_auditory dataset.
    """
    events_file_location = os.path.dirname(spm_auditory_data['func'][0])
    events_filename = os.path.basename(events_file_location) + '_events.tsv'
    events_filepath = os.path.join(events_file_location, events_filename)
    return events_filepath 
Example #16
Source File: test_base.py    From twitter-stock-recommendation with MIT License
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    bunch = Bunch(key='original')
    # This reproduces a problem when Bunch pickles have been created
    # with scikit-learn 0.16 and are read with 0.17. Basically there
    # is a surprising behaviour because reading bunch.key uses
    # bunch.__dict__ (which is non empty for 0.16 Bunch objects)
    # whereas assigning into bunch.key uses bunch.__setattr__. See
    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
    # more details
    bunch.__dict__['key'] = 'set from __dict__'
    bunch_from_pkl = loads(dumps(bunch))
    # After loading from pickle the __dict__ should have been ignored
    assert_equal(bunch_from_pkl.key, 'original')
    assert_equal(bunch_from_pkl['key'], 'original')
    # Making sure that changing the attr does change the value
    # associated with __getitem__ as well
    bunch_from_pkl.key = 'changed'
    assert_equal(bunch_from_pkl.key, 'changed')
    assert_equal(bunch_from_pkl['key'], 'changed') 
Example #17
Source File: similarity.py    From cpae with MIT License
def fetch_SimVerb3500(which='all'):
    """
    Fetch SimVerb3500 dataset for testing verb similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,

    References
    ----------
    Gerz, Daniela et al., "SimVerb-3500: A Large-Scale Evaluation Set of Verb Similarity", 2016

    Notes
    -----
    TODO
    """
    if which not in ['all', 'dev', 'test']:
        raise RuntimeError("Unrecognized value for 'which' parameter")

    url_map = {"all": 'https://www.dropbox.com/s/xct7j3h7i9bzi7y/all_SimVerb3500.txt?dl=1',
               "dev": 'https://www.dropbox.com/s/57d850d6puxl6nm/dev_SimVerb3500.txt?dl=1',
               "test": 'https://www.dropbox.com/s/66hlkkhfa6c9lrt/test_SimVerb3500.txt?dl=1'}

    data = _get_as_pd(url_map[which], which, header=None, sep=" ")
    return Bunch(X=data.values[:, 0:2].astype("object"), y=data.values[:, 2:].astype(np.float)) 
Example #18
Source File: beamformers_electrodes_tweak.py    From mmvt with GNU General Public License v3.0
def find_best_freqs_subset(event_id, bipolar, freqs_bins, from_t, to_t, time_split, combs,
        optimization_method='RidgeCV', optimization_params={}, k=3, gk_sigma=3, njobs=6):
    freqs_bins = sorted(freqs_bins)
    all_electrodes = get_all_electrodes_names(bipolar)
    elec_data = load_electrodes_data(event_id, bipolar, all_electrodes, from_t, to_t,
            subtract_min=False, normalize_data=False)
    meg_data_dic = load_all_dics(freqs_bins, event_id, bipolar, all_electrodes, from_t, to_t, gk_sigma,
        dont_calc_new_csd=True, njobs=njobs)

    uuid = utils.rand_letters(5)
    results_fol = get_results_fol(optimization_method)
    partial_results_fol = os.path.join(results_fol, 'best_freqs_subset_{}'.format(uuid))
    utils.make_dir(results_fol)
    utils.make_dir(partial_results_fol)

    cond = utils.first_key(event_id)
    all_freqs_bins_subsets = list(utils.superset(freqs_bins))
    random.shuffle(all_freqs_bins_subsets)
    N = len(all_freqs_bins_subsets)
    print('There are {} freqs subsets'.format(N))
    all_freqs_bins_subsets_chunks = utils.chunks(all_freqs_bins_subsets, int(len(all_freqs_bins_subsets) / njobs))
    params = [Bunch(event_id=event_id, bipolar=bipolar, freqs_bins_chunks=freqs_bins_subsets_chunk, cond=cond,
            from_t=from_t, to_t=to_t, freqs_bins=freqs_bins, partial_results_fol=partial_results_fol,
            time_split=time_split, only_sig_electrodes=False, only_from_same_lead=True, electrodes_positive=False,
            electrodes_normalize=False, gk_sigma=gk_sigma, k=k, do_plot_results=False, do_save_partial_results=False,
            optimization_params=optimization_params, check_only_pred_score=True, njobs=1, N=int(N / njobs),
            elec_data=elec_data, meg_data_dic=meg_data_dic, all_electrodes=all_electrodes,
            optimization_method=optimization_method, error_calc_method='rol_corr', error_threshold=30, combs=combs) for
            freqs_bins_subsets_chunk in all_freqs_bins_subsets_chunks]
    results = utils.run_parallel(_find_best_freqs_subset_parallel, params, njobs)
    all_results = []
    for chunk_results in results:
        all_results.extend(chunk_results)
    params_suffix = utils.params_suffix(optimization_params)
    output_file = os.path.join(results_fol, 'best_freqs_subset_{}_{}_{}{}.pkl'.format(cond, uuid, k, params_suffix))
    print('saving results to {}'.format(output_file))
    utils.save((all_results, freqs_bins), output_file)  # save the accumulated results from all chunks
Example #19
Source File: utils.py    From mmvt with GNU General Public License v3.0
def dic2bunch(dic):
    from sklearn.datasets.base import Bunch
    return Bunch(**dic) 
Example #20
Source File: base.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def _maybe_sklearn_data(data, target):
    if isinstance(data, Bunch):
        if target is not None:
            raise ValueError("'target' can't be specified for sklearn.datasets")
        # this should be first
        target = data.target
        # instanciate here to add column name
        columns = getattr(data, 'feature_names', None)
        data = pd.DataFrame(data.data, columns=columns)
        return data, target

    return data, target 
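The same conversion can be reproduced by hand with any built-in scikit-learn Bunch; a minimal sketch using load_iris (not part of the pandas-ml source above):

import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()                                        # a Bunch with data, target and feature_names
df = pd.DataFrame(iris.data, columns=iris.feature_names)  # same idea as _maybe_sklearn_data
print(df.head())
print(iris.target[:5])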
Example #21
Source File: datasets.py    From nistats with BSD 3-Clause "New" or "Revised" License
def fetch_localizer_first_level(data_dir=None, verbose=1):
    """ Download a first-level localizer fMRI dataset

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, with the keys:
        epi_img: the input 4D image
        events: a .tsv file describing the paradigm
    """
    url = 'https://osf.io/2bqxn/download'
    epi_img = 'sub-12069_task-localizer_space-MNI305.nii.gz'
    events = 'sub-12069_task-localizer_events.tsv'
    opts = {'uncompress': True}
    options = ('epi_img', 'events')
    dir_ = 'localizer_first_level'
    filenames = [(os.path.join(dir_, name), url, opts)
                 for name in [epi_img, events]]

    dataset_name = 'localizer_first_level'
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, filenames, verbose=verbose)

    params = dict(list(zip(options, files)))
    return Bunch(**params) 
Example #22
Source File: categorization.py    From word-embeddings-benchmarks with MIT License
def fetch_battig():
    """
    Fetch 1969 Battig dataset

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment
        'freq': frequency of response
        'frequency': Kucera-Francis word frequency
        'rank': rank of frequence within response
        'rfreq': rated frequency

    References
    ----------
    W.F Battig & W.E Montague (1968). Category norms for verbal items in 56 categories: A replication
    and extension of the Connecticut norms using University of Maryland and Illinois students
    (Tech. Rep.) University of Colorado, Boulder, CO (1968)

    Notes
    -----
    This dataset comprises a ranked list of 5231 words listed in 56 taxonomic categories by people
    who were asked to list as many exemplars of a given category ("a precious stone", "a unit of time",
    "a fruit", "a color", etc.). Participants had 30s to generate as many responses to each category as
    possible, after which time the next category name was presented.
    Included in this dataset are all words from the Battig and Montague (1969) norms listed with
    freq > 1.

    This is not the same dataset as 'battig' in Baroni et al. "Don’t count, predict! A systematic comparison of
    context-counting vs. context-predicting semantic vectors"
    """
    data = _get_cluster_assignments(dataset_name="EN-BATTIG",
                                    url="https://www.dropbox.com/sh/ckp4yu7k7xl7u2a/AABhmpgU3ake3T9liA9BR8EBa?dl=1",
                                    sep=",", skip_header=True)
    return Bunch(X=data.X[:, 0], y=data.y,
                 freq=data.X[:, 1], frequency=data.X[:, 2], rank=data.X[:, 3], rfreq=data.X[:, 4]) 
Example #23
Source File: categorization.py    From word-embeddings-benchmarks with MIT License
def fetch_ESSLI_2b():
    """
    Fetch ESSLI 2b task categorization dataset

    Parameters
    -------

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Originally published at
    http://wordspace.collocations.de/doku.php/data:esslli2008:abstract_concrete_nouns_discrimination.

    Notes
    -----
    The data set consists of 40 nouns extracted from the MRC Psycholinguistic Database, with ratings by human subjects
    on the concreteness scale. The nouns have been classified into three classes: HI, LO and ME being highly,
    low and medium abstract nouns.
    """
    return _get_cluster_assignments(dataset_name="EN-ESSLI-2b",
                                    url="https://www.dropbox.com/sh/7gdv52gy9vb4mf2/AACExLgHdbvbBrRZBP6CcdDaa?dl=1") 
Example #24
Source File: similarity.py    From word-embeddings-benchmarks with MIT License
def fetch_MTurk():
    """
    Fetch MTurk dataset for testing attributional similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,

    References
    ----------
    Radinsky, Kira et al., "A Word at a Time: Computing Word Relatedness Using Temporal Semantic Analysis", 2011

    Notes
    -----
    Human labeled examples of word semantic relatedness. The data pairs were generated using an algorithm as
    described in the paper by [K. Radinsky, E. Agichtein, E. Gabrilovich, S. Markovitch.].
    Each pair of words was evaluated by 10 people on a scale of 1-5.

    Additionally scores were multiplied by factor of 2.
    """
    data = _get_as_pd('https://www.dropbox.com/s/f1v4ve495mmd9pw/EN-TRUK.txt?dl=1',
                      'similarity', header=None, sep=" ").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(np.float)) 
Example #25
Source File: similarity.py    From word-embeddings-benchmarks with MIT License
def fetch_RW():
    """
    Fetch Rare Words dataset for testing attributional similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of std of scores

    References
    ----------
    Published at http://www-nlp.stanford.edu/~lmthang/morphoNLM/.

    Notes
    -----
    2034 word pairs that are relatively rare with human similarity scores. Rare word selection: our choices of
    rare words (word1) are based on their frequencies – based on five bins (5, 10], (10, 100], (100, 1000],
    (1000, 10000], and the affixes they possess. To create a diverse set of candidates, we randomly
    select 15 words for each configuration (a frequency bin, an affix). At the scale of Wikipedia,
    a word with frequency of 1-5 is most likely a junk word, and even restricted to words with
    frequencies above five, there are still many non-English words. To counter such problems,
    each word selected is required to have a non-zero number of synsets in WordNet(Miller, 1995).
    """
    data = _get_as_pd('https://www.dropbox.com/s/xhimnr51kcla62k/EN-RW.txt?dl=1',
                      'similarity', header=None, sep="\t").values
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(np.float),
                 sd=np.std(data[:, 3:].astype(np.float))) 
Example #26
Source File: similarity.py    From word-embeddings-benchmarks with MIT License
def fetch_SimLex999():
    """
    Fetch SimLex999 dataset for testing attributional similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of sd of scores,
        'conc': matrix with columns conc(w1), conc(w2) and concQ the from dataset
        'POS': vector with POS tag
        'assoc': matrix with columns denoting free association: Assoc(USF) and SimAssoc333

    References
    ----------
    Hill, Felix et al., "Simlex-999: Evaluating semantic models with (genuine) similarity estimation", 2014

    Notes
    -----
     SimLex-999 is a gold standard resource for the evaluation of models that learn the meaning of words and concepts.
     SimLex-999 provides a way of measuring how well models capture similarity, rather than relatedness or
     association. The scores in SimLex-999 therefore differ from other well-known evaluation datasets
     such as WordSim-353 (Finkelstein et al. 2002). The following two example pairs illustrate the
     difference - note that clothes are not similar to closets (different materials, function etc.),
     even though they are very much related: coast - shore 9.00 9.10, clothes - closet 1.96 8.00
    """
    data = _get_as_pd('https://www.dropbox.com/s/0jpa1x8vpmk3ych/EN-SIM999.txt?dl=1',
                      'similarity', sep="\t")

    # We basically select all the columns available
    X = data[['word1', 'word2']].values
    y = data['SimLex999'].values
    sd = data['SD(SimLex)'].values
    conc = data[['conc(w1)', 'conc(w2)', 'concQ']].values
    POS = data[['POS']].values
    assoc = data[['Assoc(USF)', 'SimAssoc333']].values

    return Bunch(X=X.astype("object"), y=y, sd=sd, conc=conc, POS=POS, assoc=assoc) 
Example #27
Source File: similarity.py    From word-embeddings-benchmarks with MIT License
def fetch_TR9856():
    """
    Fetch TR9856 dataset for testing multi-word term relatedness

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'topic': vector of topics providing context for each pair of terms

    References
    ----------
    Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark", 2015.

    Notes
    -----
    """
    data = pd.read_csv(os.path.join(_fetch_file(
        'https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip',
        'similarity', uncompress=True, verbose=0),
        'IBM_Debater_(R)_TR9856.v0.2', 'TermRelatednessResults.csv'), encoding="iso-8859-1")

    # We basically select all the columns available
    X = data[['term1', 'term2']].values
    y = data['score'].values
    topic = data['topic'].values

    return Bunch(X=X.astype("object"), y=y, topic=topic) 
Example #28
Source File: test_base.py    From twitter-stock-recommendation with MIT License
def test_loads_dumps_bunch():
    bunch = Bunch(x="x")
    bunch_from_pkl = loads(dumps(bunch))
    bunch_from_pkl.x = "y"
    assert_equal(bunch_from_pkl['x'], bunch_from_pkl.x) 
Example #29
Source File: datasets.py    From nistats with BSD 3-Clause "New" or "Revised" License
def fetch_spm_auditory(data_dir=None, data_name='spm_auditory',
                       subject_id='sub001', verbose=1):
    """Function to fetch SPM auditory single-subject data.

    Parameters
    ----------
    data_dir: string
        Path of the data directory. Used to force data storage in a specified
        location. If the data is already present there, then will simply
        glob it.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        Dictionary-like object, the interest attributes are:
        - 'func': string list. Paths to functional images
        - 'anat': string list. Path to anat image

    References
    ----------
    :download:
        http://www.fil.ion.ucl.ac.uk/spm/data/auditory/

    """
    data_dir = _get_dataset_dir(data_name, data_dir=data_dir,
                                verbose=verbose)
    subject_dir = os.path.join(data_dir, subject_id)
    if not os.path.exists(subject_dir):
        _download_spm_auditory_data(data_dir, subject_dir, subject_id)
    spm_auditory_data = _prepare_downloaded_spm_auditory_data(subject_dir)
    try:
        spm_auditory_data['events']
    except KeyError:
        events_filepath = _make_path_events_file_spm_auditory_data(
            spm_auditory_data)
        if not os.path.isfile(events_filepath):
            _make_events_file_spm_auditory_data(events_filepath)
        spm_auditory_data['events'] = events_filepath
    return spm_auditory_data 
Example #30
Source File: datasets.py    From nistats with BSD 3-Clause "New" or "Revised" License
def _glob_spm_multimodal_fmri_data(subject_dir):
    """glob data from subject_dir."""
    _subject_data = {'slice_order': 'descending'}

    for session in range(1, 3):
        # glob func data for session
        _subject_data = _get_func_data_spm_multimodal(subject_dir,
                                                      session,
                                                      _subject_data)
        if not _subject_data:
            return None
        # glob trials .mat file
        _subject_data = _get_session_trials_spm_multimodal(subject_dir,
                                                           session,
                                                           _subject_data)
        if not _subject_data:
            return None
        try:
            events = _make_events_file_spm_multimodal_fmri(_subject_data,
                                                           session)
        except MatReadError as mat_err:
            warnings.warn(
                '{}. An events.tsv file '
                'cannot be generated'.format(str(mat_err)))
        else:
            events_filepath = _make_events_filepath_spm_multimodal_fmri(
                _subject_data, session)
            events.to_csv(events_filepath, sep='\t', index=False)
            _subject_data['events{}'.format(session)] = events_filepath

    # glob for anat data
    _subject_data = _get_anatomical_data_spm_multimodal(subject_dir,
                                                        _subject_data)
    if not _subject_data:
        return None

    return Bunch(**_subject_data)