Python sklearn.datasets.base.Bunch() Examples
The following are 30 code examples of sklearn.datasets.base.Bunch().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.datasets.base, or try the search function.
Example #1
Source File: base.py From sklearn-theano with BSD 3-Clause "New" or "Revised" License | 6 votes |
def load_sample_images():
    """Load sample images for image manipulation.

    Loads ``sloth``, ``sloth_closeup``, ``cat_and_dog``.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'filenames', the file
        names for the images, and 'DESCR'
        the full description of the dataset.

    Notes
    -----
    Every ``.jpg`` file found in the ``images`` directory next to this
    module is loaded. Files are returned in sorted name order so that
    ``images`` and ``filenames`` are ordered identically on every platform.
    """
    module_path = os.path.join(os.path.dirname(__file__), "images")
    with open(os.path.join(module_path, 'README.txt')) as f:
        descr = f.read()
    # os.listdir() yields entries in arbitrary order; sort for a
    # deterministic result.
    filenames = [os.path.join(module_path, filename)
                 for filename in sorted(os.listdir(module_path))
                 if filename.endswith(".jpg")]
    # Load image data for each image in the source folder.
    images = [np.array(Image.open(filename, 'r')) for filename in filenames]

    return Bunch(images=images,
                 filenames=filenames,
                 DESCR=descr)
Example #2
Source File: poiRegression.py From python-urbanPlanning with MIT License | 6 votes |
def json2bunch(fName):
    """Read a POI JSON file and pack selected fields into a Bunch.

    The reading code is tied to one particular JSON layout; adapt it when
    the data is stored differently.

    Parameters
    ----------
    fName : str
        Path of a JSON file holding an iterable of records. Each record is
        indexed with 'location' ('lat', 'lng'), 'detail_info' and 'name'.

    Returns
    -------
    dataBunch : Bunch
        'data' is a float array with one row per kept record (columns as
        listed in 'feature_names'); 'target' holds the record names.

    Notes
    -----
    A record is kept only when every detail field that is extracted is
    present and its price lies strictly between 50 and 1000. Other records
    are skipped silently.
    """
    # Detail fields copied into the feature matrix, in column order.
    # 'checkin_num' is part of the presence check because it is read below;
    # without it a record lacking that field raised KeyError.
    detailKeys=('price','overall_rating','service_rating','facility_rating','hygiene_rating','image_num','comment_num','favorite_num','checkin_num')

    # The with-block guarantees the file handle is released.
    with open(fName) as f:
        jsonDecodes=json.load(f)

    infoDic=[]
    j=0  # number of records kept
    for info in jsonDecodes:
        detail=info['detail_info']
        # Extract only when all required keys carry data; otherwise ignore the record.
        if not all(key in detail for key in detailKeys):
            continue
        # Price window used to select records.
        if 50<float(detail['price'])<1000:
            j+=1
            infoDic.append([info['location']['lat'],info['location']['lng']]+[detail[key] for key in detailKeys]+[info['name']])
    print('.....................................',j)

    # Explanatory variables (features): the first eleven entries of each row.
    data=np.array([tuple(v[:11]) for v in infoDic],dtype='float')
    # Target variable (labels): the POI name.
    targetInfo=np.array([v[11] for v in infoDic])
    # Build the sklearn-style Bunch container.
    dataBunch=base.Bunch(DESCR=r'info of poi',data=data,feature_names=['lat','lng']+list(detailKeys),target=targetInfo,target_names=['price','name'])
    return dataBunch
Example #3
Source File: poiStructure.py From python-urbanPlanning with MIT License | 6 votes |
def jsonDataFilter(fileInfo):
    """Collect POI coordinates and category labels from JSON files.

    The reading code is tied to one particular storage layout; adapt it
    when the data is stored differently.

    Parameters
    ----------
    fileInfo : dict
        Maps a root directory to the JSON file names to read from it.
        Only the first key is used as the root directory.

    Returns
    -------
    dataBunch : Bunch
        'data' holds (lat, lng) pairs and 'target' the integer-encoded
        category of each point.
    class_mapping : list of (int, label)
        Pairs each integer code with its category name.

    Notes
    -----
    The category of a point is its file name minus the last five characters
    (``fName[:-5]``), presumably the ".json" suffix. The original author
    notes that the coordinates are in the Baidu coordinate system and are
    not converted to WGS84.
    """
    rootPath=list(fileInfo.keys())  # root directory of the data files
    dataName=flatten_lst(list(fileInfo.values()))  # file names to read
    coodiDic=[]
    for fName in dataName:
        # Read each JSON file; the with-block closes the handle, which the
        # earlier bare open() never did.
        with open(os.path.join(rootPath[0],fName)) as f:
            jsonDecodes=json.load(f)
        coodiDic.append([(coordi['location']['lat'],coordi['location']['lng'],fName[:-5]) for coordi in jsonDecodes])
    # One nested list was collected per file; flatten them.
    coodiDic=flatten_lst(coodiDic)
    data=np.array([(v[0],v[1]) for v in coodiDic])  # latitude / longitude
    targetNames=np.array([v[2] for v in coodiDic])  # top-level category names
    class_label=LabelEncoder()  # encode category names as integers
    targetLabel=class_label.fit_transform(targetNames)
    # Mapping between integer codes and category names.
    class_mapping=list(enumerate(class_label.classes_))
    # Build the sklearn-style Bunch container.
    dataBunch=base.Bunch(DESCR=r'spatial points datasets of poi',data=data,feature_names=["XCoordinate","yCoordinate"],target=targetLabel,target_names=class_mapping)
    return dataBunch,class_mapping
Example #4
Source File: utils.py From cpae with MIT License | 6 votes |
def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False):
    """Fetch a categorization dataset and read its cluster files.

    Parameters
    ----------
    dataset_name : str
        Name of the sub-directory (inside the "categorization" data
        directory) that holds one ``*.txt`` file per cluster.
    url : str
        Location passed to ``_fetch_file`` to obtain the dataset.
    sep : str, default " "
        Separator used to split each line into fields.
    skip_header : bool, default False
        When true, the first line of every file is dropped.

    Returns
    -------
    data : Bunch
        'X' is an object array with one entry per line (the split fields);
        'y' is an object array with the matching cluster label, taken from
        the file name up to its first ".".
    """
    data_dir = _get_dataset_dir("categorization", verbose=0)
    _fetch_file(url=url, data_dir=data_dir, uncompress=True,
                move="{0}/{0}.txt".format(dataset_name),
                verbose=0)
    # glob returns matches in arbitrary order; sort so the row order is
    # reproducible across runs and platforms.
    files = sorted(glob.glob(os.path.join(data_dir, dataset_name + "/*.txt")))
    X = []
    y = []
    for file_name in files:
        with open(file_name) as f:
            # int(skip_header) is 0 or 1, i.e. how many leading lines to skip.
            lines = f.read().splitlines()[int(skip_header):]
        X.extend(line.split(sep) for line in lines)
        y.extend([os.path.basename(file_name).split(".")[0]] * len(lines))
    return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object"))
Example #5
Source File: rasterPTSextraction_statistic_poi.py From python-urbanPlanning with MIT License | 6 votes |
def jsonDataFilter(fileInfo):
    """Collect POI coordinates and category labels from JSON files.

    The reading code is tied to one particular storage layout; adapt it
    when the data is stored differently.

    Parameters
    ----------
    fileInfo : dict
        Maps a root directory to the JSON file names to read from it.
        Only the first key is used as the root directory.

    Returns
    -------
    dataBunch : Bunch
        'data' holds (lat, lng) pairs and 'target' the integer-encoded
        category of each point.
    class_mapping : list of (int, label)
        Pairs each integer code with its category name.

    Notes
    -----
    The category of a point is its file name minus the last five characters
    (``fName[:-5]``), presumably the ".json" suffix. The original author
    notes that the coordinates are in the Baidu coordinate system and are
    not converted to WGS84.
    """
    rootPath=list(fileInfo.keys())  # root directory of the data files
    dataName=flatten_lst(list(fileInfo.values()))  # file names to read
    coodiDic=[]
    for fName in dataName:
        # Read each JSON file; the with-block closes the handle, which the
        # earlier bare open() never did.
        with open(os.path.join(rootPath[0],fName)) as f:
            jsonDecodes=json.load(f)
        coodiDic.append([(coordi['location']['lat'],coordi['location']['lng'],fName[:-5]) for coordi in jsonDecodes])
    # One nested list was collected per file; flatten them.
    coodiDic=flatten_lst(coodiDic)
    data=np.array([(v[0],v[1]) for v in coodiDic])  # latitude / longitude
    targetNames=np.array([v[2] for v in coodiDic])  # top-level category names
    class_label=LabelEncoder()  # encode category names as integers
    targetLabel=class_label.fit_transform(targetNames)
    # Mapping between integer codes and category names.
    class_mapping=list(enumerate(class_label.classes_))
    # Build the sklearn-style Bunch container.
    dataBunch=base.Bunch(DESCR=r'spatial points datasets of poi',data=data,feature_names=["XCoordinate","yCoordinate"],target=targetLabel,target_names=class_mapping)
    return dataBunch,class_mapping
Example #6
Source File: utils.py From word-embeddings-benchmarks with MIT License | 6 votes |
def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False):
    """Fetch a categorization dataset and read its cluster files.

    Parameters
    ----------
    dataset_name : str
        Name of the sub-directory (inside the "categorization" data
        directory) that holds one ``*.txt`` file per cluster.
    url : str
        Location passed to ``_fetch_file`` to obtain the dataset.
    sep : str, default " "
        Separator used to split each line into fields.
    skip_header : bool, default False
        When true, the first line of every file is dropped.

    Returns
    -------
    data : Bunch
        'X' is an object array with one entry per line (the split fields);
        'y' is an object array with the matching cluster label, taken from
        the file name up to its first ".".
    """
    data_dir = _get_dataset_dir("categorization", verbose=0)
    _fetch_file(url=url, data_dir=data_dir, uncompress=True,
                move="{0}/{0}.txt".format(dataset_name),
                verbose=0)
    # glob returns matches in arbitrary order; sort so the row order is
    # reproducible across runs and platforms.
    files = sorted(glob.glob(os.path.join(data_dir, dataset_name + "/*.txt")))
    X = []
    y = []
    for file_name in files:
        with open(file_name) as f:
            # int(skip_header) is 0 or 1, i.e. how many leading lines to skip.
            lines = f.read().splitlines()[int(skip_header):]
        X.extend(line.split(sep) for line in lines)
        y.extend([os.path.basename(file_name).split(".")[0]] * len(lines))
    return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object"))
Example #7
Source File: categorization.py From word-embeddings-benchmarks with MIT License | 6 votes |
def fetch_BLESS():
    """
    Fetch the BLESS categorization dataset (Baroni et al., 2011).

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Baroni et al. "How we BLESSed distributional semantic evaluation", 2011

    Notes
    -----
    Data set includes 200 concrete nouns (100 animate and 100 inanimate
    nouns) from different classes (e.g., tools, clothing, vehicles,
    animals, etc.).
    """
    bless_url = "https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1"
    # Downloading and parsing are delegated to the shared cluster-file reader.
    return _get_cluster_assignments(dataset_name="EN-BLESS", url=bless_url)
Example #8
Source File: categorization.py From cpae with MIT License | 6 votes |
def fetch_BLESS():
    """
    Fetch the BLESS categorization dataset (Baroni et al., 2011).

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Baroni et al. "How we BLESSed distributional semantic evaluation", 2011

    Notes
    -----
    Data set includes 200 concrete nouns (100 animate and 100 inanimate
    nouns) from different classes (e.g., tools, clothing, vehicles,
    animals, etc.).
    """
    bless_url = "https://www.dropbox.com/sh/5qbl5cmh17o3eh0/AACyCEqpMktdMI05zwphJRI7a?dl=1"
    # Downloading and parsing are delegated to the shared cluster-file reader.
    return _get_cluster_assignments(dataset_name="EN-BLESS", url=bless_url)
Example #9
Source File: test_base.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    """Unpickled Bunch must ignore a stale ``__dict__`` entry."""
    source = Bunch(key='original')
    # Reproduces a problem seen when Bunch pickles created with
    # scikit-learn 0.16 are read with 0.17: reading bunch.key goes through
    # bunch.__dict__ (non-empty for 0.16 Bunch objects) whereas assigning
    # to bunch.key goes through bunch.__setattr__. See
    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
    # more details.
    source.__dict__['key'] = 'set from __dict__'
    restored = loads(dumps(source))

    # After the pickle round trip the __dict__ entry should have been
    # ignored, for attribute access and item access alike.
    for observed in (restored.key, restored['key']):
        assert_equal(observed, 'original')

    # Changing the attribute must also change the value seen through
    # __getitem__.
    restored.key = 'changed'
    for observed in (restored.key, restored['key']):
        assert_equal(observed, 'changed')
Example #10
Source File: categorization.py From word-embeddings-benchmarks with MIT License | 6 votes |
def fetch_ESSLI_1a():
    """
    Fetch the ESSLI 1a task categorization dataset.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Originally published at
    http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization.

    Notes
    -----
    The goal of the sub-task is to group concrete nouns into semantic
    categories. The data set consists of 44 concrete nouns, belonging to 6
    semantic categories (four animates and two inanimates). The nouns are
    included in the feature norms described in McRae et al. (2005).
    """
    essli_url = "https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1"
    # Downloading and parsing are delegated to the shared cluster-file reader.
    return _get_cluster_assignments(dataset_name="EN-ESSLI-1a", url=essli_url)
Example #11
Source File: base.py From sklearn-theano with BSD 3-Clause "New" or "Revised" License | 6 votes |
def load_images(filenames):
    """Load images for image manipulation.

    Parameters
    ----------
    filenames : iterable
        Iterable of filename paths as strings. One-shot iterables such as
        generators are accepted; the paths are materialized into a list
        before use.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'filenames', the file
        names for the images (as a list)
    """
    # Materialize first: a generator would be exhausted by the loading loop
    # and then stored empty in the returned Bunch.
    filenames = list(filenames)
    # Load image data for each image in the source folder.
    images = [np.array(Image.open(filename, 'r')) for filename in filenames]

    return Bunch(images=images,
                 filenames=filenames)
Example #12
Source File: asirra.py From sklearn-theano with BSD 3-Clause "New" or "Revised" License | 6 votes |
def fetch_asirra(image_count=1000):
    """Fetch the Asirra cats and dogs dataset.

    Parameters
    ----------
    image_count : positive integer
        Forwarded to the loader as ``image_count``.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the following attributes :
        'images', the sample images, 'data', the flattened images,
        'target', the label for the image (0 for cat, 1 for dog),
        and 'DESCR' the full description of the dataset.
    """
    asirra_dir = check_fetch_asirra()
    # Results of the loader are cached on disk under the dataset directory.
    memory = Memory(cachedir=asirra_dir, compress=6, verbose=0)
    cached_fetch = memory.cache(_fetch_asirra)
    images, target = cached_fetch(asirra_dir, image_count=image_count)
    # One row per image, all remaining axes collapsed.
    flattened = images.reshape(len(images), -1)
    return Bunch(data=flattened,
                 images=images,
                 target=target,
                 DESCR="Asirra cats and dogs dataset")
Example #13
Source File: categorization.py From cpae with MIT License | 6 votes |
def fetch_ESSLI_1a():
    """
    Fetch the ESSLI 1a task categorization dataset.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Originally published at
    http://wordspace.collocations.de/doku.php/data:esslli2008:concrete_nouns_categorization.

    Notes
    -----
    The goal of the sub-task is to group concrete nouns into semantic
    categories. The data set consists of 44 concrete nouns, belonging to 6
    semantic categories (four animates and two inanimates). The nouns are
    included in the feature norms described in McRae et al. (2005).
    """
    essli_url = "https://www.dropbox.com/sh/h362565r1sk5wii/AADjcdYy3nRo-MjuFUSvb-0ya?dl=1"
    # Downloading and parsing are delegated to the shared cluster-file reader.
    return _get_cluster_assignments(dataset_name="EN-ESSLI-1a", url=essli_url)
Example #14
Source File: similarity.py From cpae with MIT License | 6 votes |
def fetch_SCWS():
    """
    Fetch SCWS dataset for testing similarity (with a context)

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': object array built from the first two columns (the word pair),
        'y': vector with mean scores (third column),
        'sd': per-row standard deviation of columns 3 to 13

    References
    ----------
    Huang et al., "Improving Word Representations via Global Context and
    Multiple Word Prototypes", 2012

    Notes
    -----
    TODO
    """
    data = _get_as_pd('https://www.dropbox.com/s/qgqj366lzzzj1ua/preproc_SCWS.txt?dl=1', 'similarity', header=None, sep="\t")
    values = data.values
    X = values[:, 0:2].astype("object")
    # The builtin ``float`` replaces ``np.float``: that alias was deprecated
    # in NumPy 1.20 and removed in 1.24; both denote the same dtype.
    mean = values[:, 2].astype(float)
    sd = np.std(values[:, 3:14].astype(float), axis=1).flatten()
    return Bunch(X=X, y=mean, sd=sd)
Example #15
Source File: datasets.py From nistats with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _make_path_events_file_spm_auditory_data(spm_auditory_data):
    """
    Build the path of the events descriptor file for the spm_auditory
    dataset.

    The file sits in the directory of the first functional image and is
    named ``<that directory's name>_events.tsv``.

    Parameters
    ----------
    spm_auditory_data: Bunch
        Must provide a non-empty sequence of paths under the 'func' key.

    Returns
    -------
    events_filepath: string
        Full path to the events.tsv file for spm_auditory dataset.
    """
    first_func_image = spm_auditory_data['func'][0]
    func_dir = os.path.dirname(first_func_image)
    tsv_name = os.path.basename(func_dir) + '_events.tsv'
    return os.path.join(func_dir, tsv_name)
Example #16
Source File: test_base.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_bunch_pickle_generated_with_0_16_and_read_with_0_17():
    """Unpickled Bunch must ignore a stale ``__dict__`` entry."""
    source = Bunch(key='original')
    # Reproduces a problem seen when Bunch pickles created with
    # scikit-learn 0.16 are read with 0.17: reading bunch.key goes through
    # bunch.__dict__ (non-empty for 0.16 Bunch objects) whereas assigning
    # to bunch.key goes through bunch.__setattr__. See
    # https://github.com/scikit-learn/scikit-learn/issues/6196 for
    # more details.
    source.__dict__['key'] = 'set from __dict__'
    restored = loads(dumps(source))

    # After the pickle round trip the __dict__ entry should have been
    # ignored, for attribute access and item access alike.
    for observed in (restored.key, restored['key']):
        assert_equal(observed, 'original')

    # Changing the attribute must also change the value seen through
    # __getitem__.
    restored.key = 'changed'
    for observed in (restored.key, restored['key']):
        assert_equal(observed, 'changed')
Example #17
Source File: similarity.py From cpae with MIT License | 5 votes |
def fetch_SimVerb3500(which='all'):
    """
    Fetch SimVerb3500 dataset for testing verb similarity

    Parameters
    ----------
    which : {'all', 'dev', 'test'}, default 'all'
        Which split of the dataset to download.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': object array built from the first two columns (the word pair),
        'y': float array built from the remaining columns (the scores)

    Raises
    ------
    RuntimeError
        If ``which`` is not one of the recognized splits.

    References
    ----------
    Gerz, Daniela et al., "SimVerb-3500: A Large-Scale Evaluation Set of
    Verb Similarity", 2016

    Notes
    -----
    TODO
    """
    if which not in ['all', 'dev', 'test']:
        raise RuntimeError("Not recognized which parameter")

    url_map = {"all": 'https://www.dropbox.com/s/xct7j3h7i9bzi7y/all_SimVerb3500.txt?dl=1',
               "dev": 'https://www.dropbox.com/s/57d850d6puxl6nm/dev_SimVerb3500.txt?dl=1',
               "test": 'https://www.dropbox.com/s/66hlkkhfa6c9lrt/test_SimVerb3500.txt?dl=1'}

    # NOTE(review): ``which`` is also passed as the second argument; the
    # SCWS fetcher passes 'similarity' in that position - confirm intent.
    data = _get_as_pd(url_map[which], which, header=None, sep=" ")
    values = data.values
    # The builtin ``float`` replaces ``np.float``: that alias was deprecated
    # in NumPy 1.20 and removed in 1.24; both denote the same dtype.
    return Bunch(X=values[:, 0:2].astype("object"),
                 y=values[:, 2:].astype(float))
Example #18
Source File: beamformers_electrodes_tweak.py From mmvt with GNU General Public License v3.0 | 5 votes |
def find_best_freqs_subset(event_id, bipolar, freqs_bins, from_t, to_t, time_split, combs,
        optimization_method='RidgeCV', optimization_params=None, k=3, gk_sigma=3, njobs=6):
    """Search all subsets of frequency bins in parallel and save the results.

    Electrode and MEG data are loaded once. Every subset of ``freqs_bins``
    (as produced by ``utils.superset``) is shuffled and split into chunks,
    and each chunk is handed to ``_find_best_freqs_subset_parallel`` through
    ``utils.run_parallel``. The merged results are saved together with the
    sorted ``freqs_bins`` in a pickle file inside the results folder of the
    optimization method.

    Parameters
    ----------
    event_id : dict-like
        Its first key (``utils.first_key``) is used as the condition name.
    bipolar, from_t, to_t, time_split, combs, gk_sigma, k
        Forwarded unchanged to the loaders and/or the per-chunk worker.
    freqs_bins : iterable
        Frequency bins; sorted before use.
    optimization_method : str, default 'RidgeCV'
        Selects the results folder and is forwarded to the worker.
    optimization_params : dict or None, default None
        Forwarded to the worker and used to build the output file suffix.
        ``None`` stands for an empty dict.
    njobs : int, default 6
        Number of parallel jobs; also determines the chunk size.

    Returns
    -------
    None
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if optimization_params is None:
        optimization_params = {}

    freqs_bins = sorted(freqs_bins)
    all_electrodes = get_all_electrodes_names(bipolar)
    elec_data = load_electrodes_data(event_id, bipolar, all_electrodes, from_t, to_t,
        subtract_min=False, normalize_data=False)
    meg_data_dic = load_all_dics(freqs_bins, event_id, bipolar, all_electrodes, from_t, to_t, gk_sigma,
        dont_calc_new_csd=True, njobs=njobs)

    # Random tag that keeps the output of separate runs apart.
    uuid = utils.rand_letters(5)
    results_fol = get_results_fol(optimization_method)
    partial_results_fol = os.path.join(results_fol, 'best_freqs_subset_{}'.format(uuid))
    utils.make_dir(results_fol)
    utils.make_dir(partial_results_fol)

    cond = utils.first_key(event_id)
    all_freqs_bins_subsets = list(utils.superset(freqs_bins))
    random.shuffle(all_freqs_bins_subsets)
    N = len(all_freqs_bins_subsets)
    print('There are {} freqs subsets'.format(N))
    # NOTE(review): the chunk size is 0 when there are fewer subsets than
    # jobs - confirm how utils.chunks handles that.
    all_freqs_bins_subsets_chunks = utils.chunks(all_freqs_bins_subsets, int(len(all_freqs_bins_subsets) / njobs))
    params = [Bunch(event_id=event_id, bipolar=bipolar, freqs_bins_chunks=freqs_bins_subsets_chunk, cond=cond,
            from_t=from_t, to_t=to_t, freqs_bins=freqs_bins, partial_results_fol=partial_results_fol,
            time_split=time_split, only_sig_electrodes=False, only_from_same_lead=True,
            electrodes_positive=False, electrodes_normalize=False, gk_sigma=gk_sigma, k=k,
            do_plot_results=False, do_save_partial_results=False, optimization_params=optimization_params,
            check_only_pred_score=True, njobs=1, N=int(N / njobs), elec_data=elec_data,
            meg_data_dic=meg_data_dic, all_electrodes=all_electrodes,
            optimization_method=optimization_method, error_calc_method='rol_corr', error_threshold=30,
            combs=combs) for freqs_bins_subsets_chunk in all_freqs_bins_subsets_chunks]
    results = utils.run_parallel(_find_best_freqs_subset_parallel, params, njobs)

    # Merge the per-chunk result lists into one.
    all_results = []
    for chunk_results in results:
        all_results.extend(chunk_results)

    params_suffix = utils.params_suffix(optimization_params)
    output_file = os.path.join(results_fol, 'best_freqs_subset_{}_{}_{}{}.pkl'.format(cond, uuid, k, params_suffix))
    print('saving results to {}'.format(output_file))
    # Save the merged results. Previously only the last chunk
    # (``chunk_results``) was written and ``all_results`` was never used.
    utils.save((all_results, freqs_bins), output_file)
Example #19
Source File: utils.py From mmvt with GNU General Public License v3.0 | 5 votes |
def dic2bunch(dic):
    """Wrap a mapping in a scikit-learn ``Bunch``.

    Parameters
    ----------
    dic : dict
        Mapping with string keys; each key becomes a key (and attribute) of
        the returned Bunch.

    Returns
    -------
    Bunch
        ``Bunch(**dic)``.
    """
    # ``sklearn.datasets.base`` is a private module in newer scikit-learn and
    # has been removed from current releases; ``Bunch`` is exposed from
    # ``sklearn.utils``. The old location is kept as a fallback for releases
    # that predate the move.
    try:
        from sklearn.utils import Bunch
    except ImportError:
        from sklearn.datasets.base import Bunch
    return Bunch(**dic)
Example #20
Source File: base.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _maybe_sklearn_data(data, target):
    """Unpack a scikit-learn ``Bunch`` into a DataFrame and its target.

    Parameters
    ----------
    data : object
        Anything; only ``Bunch`` instances are converted.
    target : object or None
        Must be ``None`` when ``data`` is a ``Bunch``.

    Returns
    -------
    tuple
        ``(data, target)``. For a ``Bunch`` this is a ``pd.DataFrame`` built
        from ``data.data`` (columns taken from ``feature_names`` when
        present) and ``data.target``; otherwise both arguments are returned
        unchanged.

    Raises
    ------
    ValueError
        If ``data`` is a ``Bunch`` and ``target`` is not ``None``.
    """
    if not isinstance(data, Bunch):
        return data, target

    if target is not None:
        raise ValueError("'target' can't be specified for sklearn.datasets")

    # Read the target first, while ``data`` is still the Bunch.
    bunch_target = data.target
    # Build the frame here so the column names can be attached.
    frame = pd.DataFrame(data.data, columns=getattr(data, 'feature_names', None))
    return frame, bunch_target
Example #21
Source File: datasets.py From nistats with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fetch_localizer_first_level(data_dir=None, verbose=1):
    """ Download a first-level localizer fMRI dataset

    Parameters
    ----------
    data_dir: string
        directory where data should be downloaded and unpacked.

    verbose: int
        forwarded to the download helpers.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        dictionary-like object, with the keys:
        epi_img: the input 4D image
        events: a csv file describing the paradigm
    """
    dataset_name = 'localizer_first_level'
    url = 'https://osf.io/2bqxn/download'
    opts = {'uncompress': True}
    # (Bunch key, file name inside the downloaded archive)
    targets = (('epi_img', 'sub-12069_task-localizer_space-MNI305.nii.gz'),
               ('events', 'sub-12069_task-localizer_events.tsv'))

    filenames = [(os.path.join(dataset_name, file_name), url, opts)
                 for _, file_name in targets]

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
    files = _fetch_files(data_dir, filenames, verbose=verbose)

    # Pair each key with the fetched file, in request order.
    params = {key: path for (key, _), path in zip(targets, files)}
    return Bunch(**params)
Example #22
Source File: categorization.py From word-embeddings-benchmarks with MIT License | 5 votes |
def fetch_battig():
    """
    Fetch 1969 Battig dataset

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment
        'freq': frequency of response
        'frequency': Kucera-Francis word frequency
        'rank': rank of frequency within response
        'rfreq': rated frequency

    References
    ----------
    W.F Battig & W.E Montague (1968). Category norms for verbal items in 56
    categories: A replication and extension of the Connecticut norms using
    University of Maryland and Illinois students (Tech. Rep.) University of
    Colorado, Boulder, CO (1968)

    Notes
    -----
    This dataset comprises a ranked list of 5231 words listed in 56
    taxonomic categories by people who were asked to list as many exemplars
    of a given category ("a precious stone", "a unit of time", "a fruit",
    "a color", etc.). Participants had 30s to generate as many responses to
    each category as possible, after which time the next category name was
    presented. Included in this dataset are all words from the Battig and
    Montague (1969) norms listed with freq > 1.

    This is not the same dataset as 'battig' in Baroni et al. "Don't count,
    predict! A systematic comparison of context-counting vs.
    context-predicting semantic vectors"
    """
    battig_url = "https://www.dropbox.com/sh/ckp4yu7k7xl7u2a/AABhmpgU3ake3T9liA9BR8EBa?dl=1"
    raw = _get_cluster_assignments(dataset_name="EN-BATTIG", url=battig_url,
                                   sep=",", skip_header=True)
    # Each column of the parsed table becomes its own field.
    table = raw.X
    return Bunch(X=table[:, 0], y=raw.y,
                 freq=table[:, 1], frequency=table[:, 2],
                 rank=table[:, 3], rfreq=table[:, 4])
Example #23
Source File: categorization.py From word-embeddings-benchmarks with MIT License | 5 votes |
def fetch_ESSLI_2b():
    """
    Fetch the ESSLI 2b task categorization dataset.

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': words
        'y': cluster assignment

    References
    ----------
    Originally published at
    http://wordspace.collocations.de/doku.php/data:esslli2008:abstract_concrete_nouns_discrimination.

    Notes
    -----
    The data set consists of 40 nouns extracted from the MRC
    Psycholinguistic Database, with ratings by human subjects on the
    concreteness scale. The nouns have been classified into three classes:
    HI, LO and ME being highly, low and medium abstract nouns.
    """
    essli_url = "https://www.dropbox.com/sh/7gdv52gy9vb4mf2/AACExLgHdbvbBrRZBP6CcdDaa?dl=1"
    # Downloading and parsing are delegated to the shared cluster-file reader.
    return _get_cluster_assignments(dataset_name="EN-ESSLI-2b", url=essli_url)
Example #24
Source File: similarity.py From word-embeddings-benchmarks with MIT License | 5 votes |
def fetch_MTurk():
    """
    Fetch MTurk dataset for testing attributional similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': object array built from the first two columns (the word pair),
        'y': vector with scores (third column, multiplied by 2)

    References
    ----------
    Radinsky, Kira et al., "A Word at a Time: Computing Word Relatedness
    Using Temporal Semantic Analysis", 2011

    Notes
    -----
    Human labeled examples of word semantic relatedness. The data pairs were
    generated using an algorithm as described in the paper by [K. Radinsky,
    E. Agichtein, E. Gabrilovich, S. Markovitch.]. Each pair of words was
    evaluated by 10 people on a scale of 1-5.

    Additionally scores were multiplied by factor of 2.
    """
    data = _get_as_pd('https://www.dropbox.com/s/f1v4ve495mmd9pw/EN-TRUK.txt?dl=1',
                      'similarity', header=None, sep=" ").values
    # The builtin ``float`` replaces ``np.float``: that alias was deprecated
    # in NumPy 1.20 and removed in 1.24; both denote the same dtype.
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=2 * data[:, 2].astype(float))
Example #25
Source File: similarity.py From word-embeddings-benchmarks with MIT License | 5 votes |
def fetch_RW():
    """
    Fetch Rare Words dataset for testing attributional similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': object array built from the first two columns (the word pair),
        'y': vector with scores (third column),
        'sd': vector with the per-row standard deviation of the remaining
        columns

    References
    ----------
    Published at http://www-nlp.stanford.edu/~lmthang/morphoNLM/.

    Notes
    -----
    2034 word pairs that are relatively rare with human similarity scores.
    Rare word selection: our choices of rare words (word1) are based on
    their frequencies - based on five bins (5, 10], (10, 100], (100, 1000],
    (1000, 10000], and the affixes they possess. To create a diverse set of
    candidates, we randomly select 15 words for each configuration (a
    frequency bin, an affix). At the scale of Wikipedia, a word with
    frequency of 1-5 is most likely a junk word, and even restricted to
    words with frequencies above five, there are still many non-English
    words. To counter such problems, each word selected is required to have
    a non-zero number of synsets in WordNet(Miller, 1995).
    """
    data = _get_as_pd('https://www.dropbox.com/s/xhimnr51kcla62k/EN-RW.txt?dl=1',
                      'similarity', header=None, sep="\t").values
    # The builtin ``float`` replaces ``np.float``: that alias was deprecated
    # in NumPy 1.20 and removed in 1.24; both denote the same dtype.
    #
    # axis=1 gives one standard deviation per word pair. Without it np.std
    # collapsed every rating in the file into a single scalar, contradicting
    # the documented "vector of std of scores".
    return Bunch(X=data[:, 0:2].astype("object"),
                 y=data[:, 2].astype(float),
                 sd=np.std(data[:, 3:].astype(float), axis=1))
Example #26
Source File: similarity.py From word-embeddings-benchmarks with MIT License | 5 votes |
def fetch_SimLex999():
    """
    Fetch SimLex999 dataset for testing attributional similarity

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'sd': vector of sd of scores,
        'conc': matrix with columns conc(w1), conc(w2) and concQ from the
        dataset
        'POS': single-column matrix with POS tag
        'assoc': matrix with columns denoting free association: Assoc(USF)
        and SimAssoc333

    References
    ----------
    Hill, Felix et al., "Simlex-999: Evaluating semantic models with
    (genuine) similarity estimation", 2014

    Notes
    -----
    SimLex-999 is a gold standard resource for the evaluation of models that
    learn the meaning of words and concepts. SimLex-999 provides a way of
    measuring how well models capture similarity, rather than relatedness or
    association. The scores in SimLex-999 therefore differ from other
    well-known evaluation datasets such as WordSim-353 (Finkelstein et al.
    2002). The following two example pairs illustrate the difference - note
    that clothes are not similar to closets (different materials, function
    etc.), even though they are very much related:
    coast - shore 9.00 9.10, clothes - closet 1.96 8.00
    """
    frame = _get_as_pd('https://www.dropbox.com/s/0jpa1x8vpmk3ych/EN-SIM999.txt?dl=1',
                       'similarity', sep="\t")

    # Essentially every available column is exposed, grouped by meaning.
    return Bunch(
        X=frame[['word1', 'word2']].values.astype("object"),
        y=frame['SimLex999'].values,
        sd=frame['SD(SimLex)'].values,
        conc=frame[['conc(w1)', 'conc(w2)', 'concQ']].values,
        POS=frame[['POS']].values,
        assoc=frame[['Assoc(USF)', 'SimAssoc333']].values)
Example #27
Source File: similarity.py From word-embeddings-benchmarks with MIT License | 5 votes |
def fetch_TR9856():
    """
    Fetch TR9856 dataset for testing multi-word term relatedness

    Returns
    -------
    data : sklearn.datasets.base.Bunch
        dictionary-like object. Keys of interest:
        'X': matrix of 2 words per column,
        'y': vector with scores,
        'topic': vector of topics providing context for each pair of terms

    References
    ----------
    Levy, Ran et al., "TR9856: A multi-word term relatedness benchmark",
    2015.
    """
    archive_dir = _fetch_file(
        'https://www.research.ibm.com/haifa/dept/vst/files/IBM_Debater_(R)_TR9856.v2.zip',
        'similarity',
        uncompress=True,
        verbose=0)
    csv_path = os.path.join(archive_dir,
                            'IBM_Debater_(R)_TR9856.v0.2',
                            'TermRelatednessResults.csv')
    frame = pd.read_csv(csv_path, encoding="iso-8859-1")

    # Essentially every available column is exposed.
    term_pairs = frame[['term1', 'term2']].values
    return Bunch(X=term_pairs.astype("object"),
                 y=frame['score'].values,
                 topic=frame['topic'].values)
Example #28
Source File: test_base.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_loads_dumps_bunch():
    """Attribute and item access stay in sync on an unpickled Bunch."""
    restored = loads(dumps(Bunch(x="x")))
    # Assign through the attribute, then read back through both interfaces.
    restored.x = "y"
    assert_equal(restored['x'], restored.x)
Example #29
Source File: datasets.py From nistats with BSD 3-Clause "New" or "Revised" License | 5 votes |
def fetch_spm_auditory(data_dir=None, data_name='spm_auditory',
                       subject_id='sub001', verbose=1):
    """Function to fetch SPM auditory single-subject data.

    Parameters
    ----------
    data_dir: string
        Path of the data directory. Used to force data storage in a
        specified location. If the data is already present there, then will
        simply glob it.

    data_name: string
        Dataset name used to resolve the dataset directory.

    subject_id: string
        Name of the subject sub-directory inside the dataset directory.

    verbose: int
        Forwarded to the dataset-directory helper.

    Returns
    -------
    data: sklearn.datasets.base.Bunch
        Dictionary-like object, the interest attributes are:
        - 'func': string list. Paths to functional images
        - 'anat': string list. Path to anat image
        - 'events': path to the events file (added here when missing)

    References
    ----------
    :download:
        http://www.fil.ion.ucl.ac.uk/spm/data/auditory/

    """
    data_dir = _get_dataset_dir(data_name, data_dir=data_dir,
                                verbose=verbose)
    subject_dir = os.path.join(data_dir, subject_id)

    # Download only when the subject directory is not there yet.
    if not os.path.exists(subject_dir):
        _download_spm_auditory_data(data_dir, subject_dir, subject_id)

    spm_auditory_data = _prepare_downloaded_spm_auditory_data(subject_dir)

    # Probe for an existing 'events' entry.
    try:
        spm_auditory_data['events']
        events_known = True
    except KeyError:
        events_known = False

    if not events_known:
        events_filepath = _make_path_events_file_spm_auditory_data(
            spm_auditory_data)
        # Create the events file only if it is not on disk already.
        if not os.path.isfile(events_filepath):
            _make_events_file_spm_auditory_data(events_filepath)
        spm_auditory_data['events'] = events_filepath

    return spm_auditory_data
Example #30
Source File: datasets.py From nistats with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _glob_spm_multimodal_fmri_data(subject_dir):
    """glob data from subject_dir.

    Collects, for sessions 1 and 2, the functional data and the trials file
    through the per-session helpers, tries to generate an events file for
    each session, then collects the anatomical data.

    Parameters
    ----------
    subject_dir : string
        Directory handed to the glob helpers.

    Returns
    -------
    Bunch or None
        The collected entries, or None as soon as one of the glob helpers
        returns an empty/falsy result.
    """
    # NOTE(review): block nesting was reconstructed from a source whose
    # whitespace had been collapsed - confirm that the try/except/else
    # belongs inside the session loop.
    _subject_data = {'slice_order': 'descending'}

    for session in range(1, 3):
        # glob func data for session
        _subject_data = _get_func_data_spm_multimodal(subject_dir, session, _subject_data)
        if not _subject_data:
            return None
        # glob trials .mat file
        _subject_data = _get_session_trials_spm_multimodal(subject_dir, session, _subject_data)
        if not _subject_data:
            return None
        try:
            events = _make_events_file_spm_multimodal_fmri(_subject_data, session)
        except MatReadError as mat_err:
            # Best effort: warn and carry on without an events entry for
            # this session.
            warnings.warn( '{}. An events.tsv file ' 'cannot be generated'.format(str(mat_err)))
        else:
            # Write the events as a tab-separated file and record its path
            # under 'events<session>'.
            events_filepath = _make_events_filepath_spm_multimodal_fmri( _subject_data, session)
            events.to_csv(events_filepath, sep='\t', index=False)
            _subject_data['events{}'.format(session)] = events_filepath

    # glob for anat data
    _subject_data = _get_anatomical_data_spm_multimodal(subject_dir, _subject_data)
    if not _subject_data:
        return None

    return Bunch(**_subject_data)