Python Code Examples for load data

60 Python code examples related to "load data" are collected below. They are extracted from open source projects; the project, author, file, and license are listed above each example.
Example 1
Project: Image-Caption-Generator   Author: dabasajay   File: load_data.py    License: MIT License
def loadTrainData(config):
	train_image_ids = load_set(config['train_data_path'])
	# Check if we already have preprocessed data saved and if not, preprocess the data.
	# Create and save 'captions.txt' & features.pkl
	preprocessData(config)
	# Load captions
	train_captions, _count = load_cleaned_captions(config['model_data_path']+'captions.txt', train_image_ids)
	# Load image features
	train_image_features = load_image_features(config['model_data_path']+'features_'+str(config['model_type'])+'.pkl', train_image_ids)
	print('{}: Available images for training: {}'.format(mytime(),len(train_image_features)))
	print('{}: Available captions for training: {}'.format(mytime(),_count))
	if not os.path.exists(config['model_data_path']+'tokenizer.pkl'):
		# Prepare tokenizer
		tokenizer = create_tokenizer(train_captions)
		# Save the tokenizer
		dump(tokenizer, open(config['model_data_path']+'tokenizer.pkl', 'wb'))
	# Determine the maximum sequence length
	max_length = calc_max_length(train_captions)
	return train_image_features, train_captions, max_length 
Example 2
Project: argus-freesound   Author: lRomul   File: utils.py    License: MIT License
def load_folds_data(use_corrections=True):
    if use_corrections:
        with open(config.corrections_json_path) as file:
            corrections = json.load(file)
        print("Corrections:", corrections)
        pkl_name = f'{config.audio.get_hash(corrections=corrections)}.pkl'
    else:
        corrections = None
        pkl_name = f'{config.audio.get_hash()}.pkl'

    folds_data_pkl_path = config.folds_data_pkl_dir / pkl_name

    if folds_data_pkl_path.exists():
        folds_data = pickle_load(folds_data_pkl_path)
    else:
        folds_data = get_folds_data(corrections)
        if not config.folds_data_pkl_dir.exists():
            config.folds_data_pkl_dir.mkdir(parents=True, exist_ok=True)
        pickle_save(folds_data, folds_data_pkl_path)
    return folds_data 
Example 3
Project: DetectAndTrack   Author: facebookresearch   File: eval_helpers.py    License: Apache License 2.0
def load_data(argv):

  dataDir = get_data_dir()

  gt_file, pred_file, mode = process_arguments(argv)
  gtFilename = dataDir + gt_file
  predFilename = dataDir + pred_file

  # load ground truth (GT)
  with open(gtFilename) as data_file:
      data = json.load(data_file)
  gtFramesAll = data

  # load predictions
  with open(predFilename) as data_file:
      data = json.load(data_file)
  prFramesAll = data

  return gtFramesAll, prFramesAll 
Example 4
Project: pancanatlas_code_public   Author: ratschlab   File: manhattan_sqtl.py    License: MIT License
def load_data():
    trans_df = pd.read_csv(os.path.join(_DATA_DIR, 'transdataresults_test_v3.tsv'), sep='\t')
    if trans_df.columns[0].startswith('#'):
        cols = trans_df.columns.tolist()
        cols[0] = cols[0].replace('# ', '')
        trans_df.columns = cols
    trans_df = trans_df.set_index('gene-id(event)')
    trans_df['is_trans'] = True

    cis_df = pd.read_csv(os.path.join(_DATA_DIR, 'cisdataresults_test_v3.tsv'), sep='\t')
    if cis_df.columns[0].startswith('#'):
        cols = cis_df.columns.tolist()
        cols[0] = cols[0].replace('# ', '')
        cis_df.columns = cols
    cis_df = cis_df.set_index('gene-id(event)')
    cis_df['is_trans'] = False

    assert cis_df.columns.equals(trans_df.columns)
    data = pd.concat((trans_df, cis_df))
    return data 
Example 5
Project: cccatalog   Author: creativecommons   File: sql.py    License: MIT License
def load_s3_data_to_intermediate_table(
        postgres_conn_id,
        bucket,
        s3_key,
        identifier
):
    load_table = _get_load_table_name(identifier)
    logger.info(f'Loading {s3_key} from S3 Bucket {bucket} into {load_table}')

    postgres = PostgresHook(postgres_conn_id=postgres_conn_id)
    postgres.run(
        dedent(
            f"""
            SELECT aws_s3.table_import_from_s3(
              '{load_table}',
              '',
              'DELIMITER E''\t''',
              '{bucket}',
              '{s3_key}',
              'us-east-1'
            );
            """
        )
    )
    _clean_intermediate_table_data(postgres, load_table) 
Example 6
Project: tartarus   Author: sergiooramas   File: load_w2v.py    License: MIT License
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    train_index = open(common.DATASETS_DIR + "/items_index_train_%s.tsv" % (DATASET_NAME)).read().splitlines()
    val_index = open(common.DATASETS_DIR + "/items_index_val_%s.tsv" % (DATASET_NAME)).read().splitlines()
    test_index = open(common.DATASETS_DIR + "/items_index_test_%s.tsv" % (DATASET_NAME)).read().splitlines()

    documents, sentences = load_data_set(train_index)
    documents_padded = pad_sentences(documents)
    vocabulary, vocabulary_inv = build_vocab(documents_padded)
    x_train = build_input_data(documents_padded, vocabulary)

    documents, _ = load_data_set(val_index)
    documents_padded = pad_sentences(documents)
    x_val = build_input_data(documents_padded, vocabulary)

    documents, _ = load_data_set(test_index)
    documents_padded = pad_sentences(documents)
    x_test = build_input_data(documents_padded, vocabulary)

    return [x_train, x_val, x_test, vocabulary, vocabulary_inv, sentences] 
Example 7
Project: deep_learning_ex   Author: zatonovo   File: cnn_mnist.py    License: MIT License
def load_data():
    print 'Loading data...'
    (X_train, y_train), (X_test, y_test) = mnist.load_data()

    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')

    X_train /= 255
    X_test /= 255

    y_train = np_utils.to_categorical(y_train, 10)
    y_test = np_utils.to_categorical(y_test, 10)

    X_train = np.reshape(X_train, (60000, 1, 28,28))
    X_test = np.reshape(X_test, (10000, 1, 28,28))

    print 'Data loaded'
    return [X_train, X_test, y_train, y_test] 
Example 8
Project: fine-lm   Author: akzaidi   File: inference.py    License: MIT License
def load_data(input_file, input_vocab):
  """Returns an iterator over the input file.

  Args:
    input_file: The input text file.
    input_vocab: The input vocabulary.

  Returns:
    A dataset batch iterator.
  """
  dataset = tf.data.TextLineDataset(input_file)
  dataset = dataset.map(lambda x: tf.string_split([x]).values)
  dataset = dataset.map(input_vocab.lookup)
  dataset = dataset.map(lambda x: {
      "ids": x,
      "length": tf.shape(x)[0]})
  dataset = dataset.padded_batch(64, {
      "ids": [None],
      "length": []})
  return dataset.make_initializable_iterator() 
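A minimal usage sketch for the iterator returned above, assuming a TensorFlow 1.x session and that input_vocab is a lookup table created elsewhere; the file name and session setup are illustrative, not part of the original project.

import tensorflow as tf

# Hypothetical usage; input_vocab is assumed to be a tf.contrib.lookup index table.
iterator = load_data("newstest.txt", input_vocab)
next_batch = iterator.get_next()
with tf.Session() as sess:
    sess.run(tf.tables_initializer())   # initialize the vocabulary lookup table
    sess.run(iterator.initializer)      # initialize the dataset iterator
    try:
        while True:
            batch = sess.run(next_batch)
            print(batch["ids"].shape, batch["length"])
    except tf.errors.OutOfRangeError:
        pass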
Example 9
Project: d2l-zh   Author: d2l-ai   File: utils.py    License: Apache License 2.0
def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(
        '~', '.mxnet', 'datasets', 'fashion-mnist')):
    """Download the fashion mnist dataset and then load into memory."""
    root = os.path.expanduser(root)
    transformer = []
    if resize:
        transformer += [gdata.vision.transforms.Resize(resize)]
    transformer += [gdata.vision.transforms.ToTensor()]
    transformer = gdata.vision.transforms.Compose(transformer)

    mnist_train = gdata.vision.FashionMNIST(root=root, train=True)
    mnist_test = gdata.vision.FashionMNIST(root=root, train=False)
    num_workers = 0 if sys.platform.startswith('win32') else 4

    train_iter = gdata.DataLoader(mnist_train.transform_first(transformer),
                                  batch_size, shuffle=True,
                                  num_workers=num_workers)
    test_iter = gdata.DataLoader(mnist_test.transform_first(transformer),
                                 batch_size, shuffle=False,
                                 num_workers=num_workers)
    return train_iter, test_iter 
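A brief usage sketch for the Gluon loaders above; the batch size and resize value are illustrative choices, not values from the original project.

# The first call downloads Fashion-MNIST into ~/.mxnet/datasets if it is not already cached.
train_iter, test_iter = load_data_fashion_mnist(batch_size=256, resize=96)
for X, y in train_iter:
    print(X.shape, y.shape)   # e.g. (256, 1, 96, 96) (256,)
    break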
Example 10
Project: vq-vae   Author: Kyubyong   File: data_load.py    License: Apache License 2.0
def load_data(mode="train"):
    '''Loads data
    Args:
      mode: "train" or "eval".

    Returns:
      files: A list of sound file paths.
      speaker_ids: A list of speaker ids.
    '''
    if mode=="train":
        wavs = glob.glob('/data/private/speech/vctk/wavs/*.npy')
        # wavs = glob.glob('vctk/wavs/*.npy')
        qts = [wav.replace("wavs", "qts") for wav in wavs]
        speakers = np.array([speaker2id(os.path.basename(wav)[:4]) for wav in wavs], np.int32)

        return wavs, qts, speakers
    else: # test. two samples.
        files = ['/data/private/speech/vctk/qts/'+line.split("|")[0].strip() + ".npy" for line in hp.test_data.splitlines()]
        speaker_ids = [int(line.split("|")[1]) for line in hp.test_data.splitlines()]
        return files, speaker_ids

# load_data() 
Example 11
Project: CloudComputing   Author: devdattakulkarni   File: dynamodb_handler.py    License: Apache License 2.0
def create_and_load_data(self, tableName, fileName):
        # TODO - This function should create a table named <tableName>
        # and load data from the file named <fileName>
        pass

def dispatch(self, command_string):
        # TODO - This function takes in as input a string command (e.g. 'insert_movie')
        # the return value of the function should depend on the command
        # For commands 'insert_movie', 'delete_movie', 'update_movie', 'delete_table':
        #       return the message as a string that is expected as the output of the command
        # For commands 'search_movie_actor', 'search_movie_actor_director', 'print_stats':
        #       return a list of JSON objects where each JSON object has only the required
        #       keys and attributes of the expected result items.

        # Note: You should not print anything to the command line in this function.
        response = None

        return response 
Example 12
Project: Benchmarks   Author: ECP-CANDLE   File: reg_go2.py    License: MIT License
def load_data():

    data_path = args['in']
        
    df = (pd.read_csv(data_path,skiprows=1).values).astype('float32')

    df_y = df[:,0].astype('float32')
    df_x = df[:, 1:PL].astype(np.float32)


#    scaler = MaxAbsScaler()
        
    scaler = StandardScaler()
    df_x = scaler.fit_transform(df_x)
        
    X_train, X_test, Y_train, Y_test = train_test_split(df_x, df_y, test_size= 0.20, random_state=42)
    
    print('x_train shape:', X_train.shape)
    print('x_test shape:', X_test.shape)

    
    return X_train, Y_train, X_test, Y_test 
Example 13
Project: relation-autoencoder   Author: diegma   File: OieEvaluation.py    License: Apache License 2.0
def loadData(pickled_dataset):

    if not os.path.exists(pickled_dataset):
        print "Pickled dataset not found"
        sys.exit()

    pklFile = open(pickled_dataset, 'rb')

    featureExtrs = pickle.load(pklFile)

    relationLexicon = pickle.load(pklFile)

    data = pickle.load(pklFile)

    goldStandard = pickle.load(pklFile)

    pklFile.close()


    return goldStandard 
Example 14
Project: ReadableWebProxy   Author: fake-name   File: rss_views.py    License: BSD 3-Clause "New" or "Revised" License
def feedLoadFilteredData(feedid):

	print("Loading data")
	releases = g.session.query(db.RssFeedPost)                                                       \
		.filter(db.RssFeedPost.feed_id == feedid)                                                 \
		.filter(db.RssFeedPost.published > datetime.datetime.now() - datetime.timedelta(days=30)) \
		.all()

	if len(releases) < 10:
		# If there were no releases within the window, fetch ALL THE THINGS.
		releases = g.session.query(db.RssFeedPost)                                                       \
			.filter(db.RssFeedPost.feed_id == feedid)                                                 \
			.all()

	# .join(db.RssFeedEntry.releases)                                                          \

	print("Loaded. Procesing.")
	items = proto_process_releases(releases)

	return render_template('rss-pages/feed_items_processed_block.html',
						   items         = items,
						   release_count = len(releases),
						   ) 
Example 15
Project: dissemin   Author: dissemin   File: conftest.py    License: GNU Affero General Public License v3.0
def load_test_data(request, db, django_db_setup, django_db_blocker):
    with django_db_blocker.unblock():
        call_command('loaddata', 'test_dump.json')
        self = request.cls
        self.i = Institution.objects.get(name='ENS')
        self.d = Department.objects.get(name='Chemistry dept')
        self.di = Department.objects.get(name='Comp sci dept')

        self.r1 = get_researcher_by_name('Isabelle', 'Aujard')
        self.r2 = get_researcher_by_name('Ludovic', 'Jullien')
        self.r3 = get_researcher_by_name('Antoine', 'Amarilli')
        self.r4 = get_researcher_by_name('Antonin', 'Delpeuch')
        self.r5 = get_researcher_by_name('Terence', 'Tao')
        self.hal = OaiSource.objects.get(identifier='hal')
        self.arxiv = OaiSource.objects.get(identifier='arxiv')
        self.lncs = Journal.objects.get(issn='0302-9743')
        self.acm = Journal.objects.get(issn='1529-3785').publisher 
Example 16
Project: pancanatlas_code_public   Author: ratschlab   File: sf_heatmap.py    License: MIT License
def load_gtex_data(sf_interest):
    #if RESET_GTEX_CACHE: print "WARNING: Resetting gtex cache"
    sf_names = sf_interest.keys()
    ensg_interest = sf_interest.values()

    map_event_to_file = {'exon_skip': config.alt_splice_exon_skip_gtex_path,
                     'intron_retention': config.alt_splice_intron_retention_gtex_path,
                     'alt_3prime': config.alt_splce_alt_3prime_gtex_path,
                     'alt_5prime': config.alt_splce_alt_5prime_gtex_path}

    cache_path = os.path.join(CACHE_DIR, 'altsplice_interest_sf_gtex.tsv')
    cache_sf_interest_path = os.path.join(CACHE_DIR, 'altsplice_interest_sf_gtex.names')
    if False: #not RESET_GTEX_CACHE and os.path.exists(cache_path):
        assert _check_sf_interest_names(sf_names, cache_sf_interest_path)
        df = pd.read_csv(cache_path, sep='\t', index_col=0)
    else:
        # Load all data to impute nans
        # Only keep columns with genes of interest (cols describe events within gene)
        # Return intersection across all events
        df_dict = dict()
        for etype, path in map_event_to_file.items():
            df, col_idx= _load_single_hdf5(path, ensg_interest)
            df.columns = map(lambda x: etype + '.' + x, df.columns)
            df_dict[etype] = (df, col_idx)
    return df_dict 
Example 17
Project: CNNC   Author: xiaoyeye   File: train_with_labels_wholedatax.py    License: MIT License
def load_data_TF2(indel_list,data_path): # cell type specific  ## random samples for reactome is not enough, need borrow some from keggp
    import random
    import numpy as np
    xxdata_list = []
    yydata = []
    count_set = [0]
    count_setx = 0
    for i in indel_list:#len(h_tf_sc)):
        xdata = np.load(data_path+'/Nxdata_tf' + str(i) + '.npy')
        ydata = np.load(data_path+'/ydata_tf' + str(i) + '.npy')
        for k in range(len(ydata)):
            xxdata_list.append(xdata[k,:,:,:])
            yydata.append(ydata[k])
        count_setx = count_setx + len(ydata)
        count_set.append(count_setx)
        print (i,len(ydata))
    yydata_array = np.array(yydata)
    yydata_x = yydata_array.astype('int')
    print (np.array(xxdata_list).shape)
    return((np.array(xxdata_list),yydata_x,count_set)) 
Example 18
Project: GGP   Author: yincheng   File: utils.py    License: Apache License 2.0
def load_data_ssl(data_name):
    adj_csr, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(data_name)
    adj_mat = np.asarray(adj_csr.toarray(), dtype=np_float_type)
    x_tr = np.reshape(np.arange(len(train_mask))[train_mask], (-1, 1))
    x_val = np.reshape(np.arange(len(val_mask))[val_mask], (-1, 1))
    x_test = np.reshape(np.arange(len(test_mask))[test_mask], (-1, 1))
    y_tr = np.asarray(y_train[train_mask], dtype=np.int32)
    y_tr = np.reshape(np.sum(np.tile(np.arange(y_tr.shape[1]), (np.sum(train_mask), 1)) * y_tr, axis=1), (-1, 1))
    y_val = np.asarray(y_val[val_mask], dtype=np.int32)
    y_val = np.reshape(np.sum(np.tile(np.arange(y_val.shape[1]), (np.sum(val_mask), 1)) * y_val, axis=1), (-1, 1))
    y_test = np.asarray(y_test[test_mask], dtype=np.int32)
    y_test = np.reshape(np.sum(np.tile(np.arange(y_test.shape[1]), (np.sum(test_mask), 1)) * y_test, axis=1), (-1, 1))
    node_features = features.toarray()
    if data_name.lower() != 'pubmed': #pubmed already comes with tf-idf
        transformer = TfidfTransformer(smooth_idf=True)
        node_features = transformer.fit_transform(node_features).toarray()
    return adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test 
Example 19
Project: CAPTCHA-breaking   Author: lllcho   File: cifar10.py    License: MIT License
def load_data():
    dirname = "cifar-10-batches-py"
    origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    path = get_file(dirname, origin=origin, untar=True)

    nb_test_samples = 10000
    nb_train_samples = 50000

    X_train = np.zeros((nb_train_samples, 3, 32, 32), dtype="uint8")
    y_train = np.zeros((nb_train_samples,), dtype="uint8")

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        data, labels = load_batch(fpath)
        X_train[(i-1)*10000:i*10000, :, :, :] = data
        y_train[(i-1)*10000:i*10000] = labels

    fpath = os.path.join(path, 'test_batch')
    X_test, y_test = load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    return (X_train, y_train), (X_test, y_test) 
Example 20
Project: mmskeleton   Author: open-mmlab   File: skeleton_feeder.py    License: Apache License 2.0
def load_data(self, mmap):
        # data: N C V T M

        # load label
        with open(self.label_path, 'rb') as f:
            self.sample_name, self.label = pickle.load(f)

        # load data
        if mmap:
            self.data = np.load(self.data_path, mmap_mode='r')
        else:
            self.data = np.load(self.data_path)

        if self.debug:
            self.label = self.label[0:100]
            self.data = self.data[0:100]
            self.sample_name = self.sample_name[0:100]

        self.N, self.C, self.T, self.V, self.M = self.data.shape 
Example 21
Project: incubator-superset   Author: apache   File: birth_names.py    License: Apache License 2.0
def load_data(tbl_name: str, database: Database) -> None:
    pdf = pd.read_json(get_example_data("birth_names.json.gz"))
    pdf.ds = pd.to_datetime(pdf.ds, unit="ms")
    pdf.to_sql(
        tbl_name,
        database.get_sqla_engine(),
        if_exists="replace",
        chunksize=500,
        dtype={
            "ds": DateTime,
            "gender": String(16),
            "state": String(10),
            "name": String(255),
        },
        index=False,
    )
    print("Done loading table!")
    print("-" * 80) 
Example 22
Project: UWGAN_UIE   Author: infrontofme   File: utils.py    License: MIT License
def load_data(self):

        """
        load training image

        :return: trainA and trainB path
        """

        trainA_paths = np.asarray(glob.glob(self.trainA_path))
        trainA_paths.sort()
        # print(trainA_paths[233])
        trainB_paths = np.asarray(glob.glob(self.trainB_path))
        trainB_paths.sort()
        # print(trainB_paths[233])

        print(len(trainB_paths), 'training images')

        return trainA_paths, trainB_paths 
Example 23
Project: plonk   Author: dmentipl   File: _phantom_evolution.py    License: MIT License
def load_data_from_file(
    filenames: Union[str, Path, Tuple[str], Tuple[Path], List[str], List[Path]],
):
    """Load data from Phantom .ev files."""
    if isinstance(filenames, (str, Path)):
        _filenames = [filenames]
    elif isinstance(filenames, (list, tuple)):
        _filenames = list(filenames)
    else:
        raise ValueError('filenames is not a known type')

    _file_paths = list()
    for filename in _filenames:
        path = Path(filename)
        _file_paths.append(path.resolve())
    file_paths = tuple(_file_paths)

    _check_file_consistency(file_paths, NAME_MAP)
    columns = _get_columns(file_paths[0], NAME_MAP)
    dataframe = _get_data(columns, file_paths)

    return dataframe 
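As the type checks above suggest, either a single path or a sequence of paths is accepted; a small sketch with hypothetical file names follows.

df = load_data_from_file('disc01.ev')                 # one Phantom .ev file
df = load_data_from_file(['disc01.ev', 'disc02.ev'])  # several files combined into one dataframe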
Example 24
Project: ASR_WORD   Author: zw76859420   File: readdata_02.py    License: GNU Affero General Public License v3.0
def load_data_list(self):
        if self.type == 'train':
            filename_wavlist_thchs30 = 'datalist' + self.slash + 'train.wav.lst'
            filename_wordlist_thchs30 = 'datalist' + self.slash + 'train.word.txt'
        elif self.type == 'dev':
            filename_wavlist_thchs30 = 'datalist' + self.slash + 'cv.wav.lst'
            filename_wordlist_thchs30 = 'datalist' + self.slash + 'cv.word.txt'
        elif self.type == 'test':
            filename_wavlist_thchs30 = 'datalist' + self.slash + 'test.wav.lst'
            filename_wordlist_thchs30 = 'datalist' + self.slash + 'test.word.txt'
        else:
            pass
        self.dic_wavlist , self.list_wavnum = get_wav_list(self.datapath + filename_wavlist_thchs30)
        self.dic_textlist , self.list_textnum = get_wav_text(self.datapath + filename_wordlist_thchs30)
        self.datanum = self.get_data_num()
        pass 
Example 25
Project: chainer   Author: chainer   File: seq2seq.py    License: MIT License
def load_data_using_dataset_api(
        src_vocab, src_path, target_vocab, target_path, filter_func):

    def _transform_line(vocabulary, line):
        words = line.strip().split()
        return numpy.array(
            [vocabulary.get(w, UNK) for w in words], numpy.int32)

    def _transform(example):
        source, target = example
        return (
            _transform_line(src_vocab, source),
            _transform_line(target_vocab, target)
        )

    return chainer.datasets.TransformDataset(
        chainer.datasets.TextDataset(
            [src_path, target_path],
            encoding='utf-8',
            filter_func=filter_func
        ), _transform) 
Example 26
Project: LDG   Author: uoguelph-mlrg   File: social_data_loader.py    License: Educational Community License v2.0
def load_data(data_dir, prob, dump=True):
        data_file = pjoin(data_dir, 'data_prob%s.pkl' % prob)
        if os.path.isfile(data_file):
            print('loading data from %s' % data_file)
            with open(data_file, 'rb') as f:
                data = pickle.load(f)
        else:
            data = {'initial_embeddings': SubjectsReader(pjoin(data_dir, 'Subjects.csv')).features_onehot}
            for split in ['train', 'test']:
                data.update(
                    {split: SocialEvolution(data_dir, split=split, MIN_EVENT_PROB=prob)})
            if dump:
                # dump data files to avoid their generation again
                print('saving data to %s' % data_file)
                with open(data_file, 'wb') as f:
                    pickle.dump(data, f, protocol=2)  # for compatibility
        return data 
Example 27
def load_data(data_path, max_len=200):
    data = []
    l = []
    ids = []
    i = 0
    l_encoder = LabelEncoder()
    with open(data_path, 'rb') as inf:
        for line in inf:
            gzip_fields = line.decode('utf-8').split('\t')
            gzip_id = gzip_fields[0]
            gzip_label = gzip_fields[1]
            elmo_embd_str = gzip_fields[4].strip()
            elmo_embd_list = ast.literal_eval(elmo_embd_str)
            elmo_embd_array = np.array(elmo_embd_list)
            padded_seq = sequence.pad_sequences([elmo_embd_array], maxlen=max_len, dtype='float32')[0]
            data.append(padded_seq)
            l.append(gzip_label)
            ids.append(gzip_id)
            i += 1
            print(i)
    label = l_encoder.fit_transform(l)
    return np.array(data), np.array(label), np.array(ids) 
Example 28
Project: NER-BERT-pytorch   Author: lemonhu   File: data_loader.py    License: MIT License
def load_data(self, data_type):
        """Loads the data for each type in types from data_dir.

        Args:
            data_type: (str) has one of 'train', 'val', 'test' depending on which data is required.
        Returns:
            data: (dict) contains the data with tags for each type in types.
        """
        data = {}
        
        if data_type in ['train', 'val', 'test']:
            sentences_file = os.path.join(self.data_dir, data_type, 'sentences.txt')
            tags_path = os.path.join(self.data_dir, data_type, 'tags.txt')
            self.load_sentences_tags(sentences_file, tags_path, data)
        else:
            raise ValueError("data type not in ['train', 'val', 'test']")
        return data 
Example 29
Project: medical-diagnosis-cnn-rnn-rcnn   Author: baiyyang   File: data_helpers.py    License: Apache License 2.0
def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Return split sentences and labels.
    :param positive_data_file:
    :param negative_data_file:
    :return:
    """
    # Load data from files
    positive_examples = list(open(positive_data_file, 'r', encoding='utf-8').readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, 'r', encoding='utf-8').readlines())
    negative_examples = [s.strip() for s in negative_examples]

    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]

    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y] 
Example 30
Project: ImageEnhancer   Author: CongBao   File: enhancer_gan.py    License: MIT License
def load_data(self, process=False):
        """ load image data and initialize train, validation, test set
            :param process: whether the data is to be processed by trained model, default False
        """
        if self.corrupt_type == 'ZIP':
            row, col, channel = self.img_shape
            self.shape['in'] = tuple([int(row / 2), int(col / 2), channel])
        else:
            self.shape['in'] = self.img_shape
        self.shape['out'] = self.img_shape
        if process:
            self.source['process'] = load_img(self.img_dir, self.img_shape, ratio=None)
            self.source['process'] = self.source['process'].astype('float32') / 255
            return
        self.source['train'], self.source['valid'], self.source['test'] = load_img(self.img_dir, self.img_shape)
        print('Preprocessing data...')
        self.source['train'] = self.source['train'].astype('float32') / 255
        self.source['valid'] = self.source['valid'].astype('float32') / 255
        self.source['test'] = self.source['test'].astype('float32') / 255
        self.corrupted['train'] = self._corrupt(self.source['train'])
        self.corrupted['valid'] = self._corrupt(self.source['valid'])
        self.corrupted['test'] = self._corrupt(self.source['test']) 
Example 31
Project: perception   Author: BerkeleyAutomation   File: image.py    License: Apache License 2.0
def load_data(filename):
        """Loads a data matrix from a given file.

        Parameters
        ----------
        filename : :obj:`str`
            The file to load the data from. Must be one of .png, .jpg,
            .npy, or .npz.

        Returns
        -------
        :obj:`numpy.ndarray`
            The data array read from the file.
        """
        file_root, file_ext = os.path.splitext(filename)
        data = None
        if file_ext.lower() in COLOR_IMAGE_EXTS:
            data = cv2.cvtColor(cv2.imread(filename), cv2.COLOR_BGR2RGB)
        elif file_ext == '.npy':
            data = np.load(filename)
        elif file_ext == '.npz':
            data = np.load(filename)['arr_0']
        else:
            raise ValueError('Extension %s not supported' % (file_ext))
        return data 
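A short sketch of the extension-based dispatch above; the file names are hypothetical, and the function is assumed to be in scope exactly as shown.

rgb   = load_data('frame_0.png')    # color images are read with OpenCV and converted BGR -> RGB
depth = load_data('depth_0.npy')    # .npy files are loaded as raw NumPy arrays
mask  = load_data('mask_0.npz')     # .npz files return the 'arr_0' array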
Example 32
Project: 3DChromatin_ReplicateQC   Author: kundajelab   File: plot_quasar_transform.py    License: MIT License
def load_data(infile, chroms, resolutions):
    starts = infile['starts'][...]
    chromosomes = infile['chromosomes'][...]
    data = {}
    for res in resolutions:
        data[res] = {}
        for i, chrom in enumerate(chromosomes):
            if chrom not in chroms:
                continue
            start = (starts[i] / res) * res
            dist = infile['dist.%s.%i' % (chrom, res)][...]
            valid_rows = infile['valid.%s.%i' % (chrom, res)][...]
            corr = infile['corr.%s.%i' % (chrom, res)][...]
            valid = numpy.zeros(corr.shape, dtype=numpy.bool)
            N, M = corr.shape
            valid = numpy.zeros((N, M), dtype=numpy.int32)
            for i in range(min(N - 1, M)):
                P = N - i - 1
                valid[:P, i] = valid_rows[(i + 1):] * valid_rows[:P]
            temp = corr * dist
            valid[numpy.where(numpy.abs(temp) == numpy.inf)] = False
            data[res][chrom] = [start, temp, valid]
    return data 
Example 33
Project: dgl   Author: dmlc   File: utils.py    License: Apache License 2.0
def load_training_data(f_name):
    print('We are loading data from:', f_name)
    edge_data_by_type = dict()
    all_nodes = list()
    with open(f_name, 'r') as f:
        for line in f:
            words = line[:-1].split(' ')  # line[-1] == '\n'
            if words[0] not in edge_data_by_type:
                edge_data_by_type[words[0]] = list()
            x, y = words[1], words[2]
            edge_data_by_type[words[0]].append((x, y))
            all_nodes.append(x)
            all_nodes.append(y)
    all_nodes = list(set(all_nodes))
    print('Total training nodes: ' + str(len(all_nodes)))
    return edge_data_by_type


# for each line, the data is [edge_type, node, node, true_or_false] 
Example 34
Project: video-to-pose3D   Author: zh-plus   File: processor_base.py    License: MIT License
def load_data(self):
        Feeder = import_class(self.arg.feeder)
        if 'debug' not in self.arg.train_feeder_args:
            self.arg.train_feeder_args['debug'] = self.arg.debug
        self.data_loader = dict()
        if self.arg.phase == 'train':
            self.data_loader['train'] = torch.utils.data.DataLoader(
                dataset=Feeder(**self.arg.train_feeder_args),
                batch_size=self.arg.batch_size,
                shuffle=True,
                num_workers=self.arg.num_worker * torchlight.ngpu(
                    self.arg.device),
                drop_last=True)
        if self.arg.test_feeder_args:
            self.data_loader['test'] = torch.utils.data.DataLoader(
                dataset=Feeder(**self.arg.test_feeder_args),
                batch_size=self.arg.test_batch_size,
                shuffle=False,
                num_workers=self.arg.num_worker * torchlight.ngpu(
                    self.arg.device)) 
Example 35
Project: hamaa   Author: monitor1379   File: datasets.py    License: GNU General Public License v3.0
def load_mnist_data(nb_training, nb_test, preprocess=False, flatten=True, one_hot=True):
    # Automatically check for the data; if the data file does not exist, download it first
    download_mnist_data()
    training_x = mnist_decoder.load_train_images(num_data=nb_training)
    training_y = mnist_decoder.load_train_labels(num_data=nb_training)
    test_x = mnist_decoder.load_test_images(num_data=nb_test)
    test_y = mnist_decoder.load_test_labels(num_data=nb_test)

    if preprocess:
        training_x /= 255.
        test_x /= 255.

    if flatten:
        training_x = training_x.reshape(training_x.shape[0], 784)
        test_x = test_x.reshape(test_x.shape[0], 784)
    else:
        training_x = training_x.reshape((training_x.shape[0], 1, training_x.shape[1], training_x.shape[2]))
        test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1], test_x.shape[2]))

    if one_hot:
        training_y = np_utils.to_one_hot(training_y, 10)
        test_y = np_utils.to_one_hot(test_y, 10)

    return (training_x, training_y), (test_x, test_y) 
Example 36
Project: saliency   Author: alexanderkroner   File: data.py    License: MIT License
def load_data(self):
        train_list_x = _get_file_list(self._dir_stimuli_train)
        train_list_y = _get_file_list(self._dir_saliency_train)

        _check_consistency(zip(train_list_x, train_list_y), 10000)

        train_set = _fetch_dataset((train_list_x, train_list_y),
                                   self._target_size, True)

        valid_list_x = _get_file_list(self._dir_stimuli_valid)
        valid_list_y = _get_file_list(self._dir_saliency_valid)

        _check_consistency(zip(valid_list_x, valid_list_y), 5000)

        valid_set = _fetch_dataset((valid_list_x, valid_list_y),
                                   self._target_size, False)

        return (train_set, valid_set) 
Example 37
Project: mlens   Author: flennerhag   File: mnist.py    License: MIT License
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    safe_print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255

    # Create train-test split (as [Joachims, 2006])
    safe_print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test 
Example 38
Project: cs224d   Author: bogatyy   File: q3_RNNLM.py    License: MIT License
def load_data(self, debug=False):
    """Loads starter word-vectors and train/dev/test data."""
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))
    self.encoded_train = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('train')],
        dtype=np.int32)
    self.encoded_valid = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('valid')],
        dtype=np.int32)
    self.encoded_test = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('test')],
        dtype=np.int32)
    if debug:
      num_debug = 1024
      self.encoded_train = self.encoded_train[:num_debug]
      self.encoded_valid = self.encoded_valid[:num_debug]
      self.encoded_test = self.encoded_test[:num_debug] 
Example 39
Project: autodeepnet   Author: autodeepnet   File: data_utils.py    License: MIT License
def load_data(file_path, load_format='hdf5', **kwargs):
    if 'key' not in kwargs and load_format == 'hdf5':
        kwargs['key'] = None
    if load_format != 'csv' and 'pandas_format' not in kwargs:
        kwargs['pandas_format'] = True
    if 'mode' not in kwargs:
        if load_format == 'pickle':
            kwargs['mode'] = 'rb'
        elif load_format == 'hdf5':
            kwargs['mode'] = 'r'
    logger.info("Attempting to load data from {}...".format(file_path))
    if not os.path.isfile(file_path):
        logger.error("File {} does not exist".format(file_path))
    loader = {
        'hdf5': load_hdf5_data,
        'csv': load_csv_data,
        'pickle': load_pickle_data
    }
    try:
        return loader.get(load_format, load_hdf5_data)(file_path, **kwargs)
    except Exception as e:
        logger.exception("Error loading file {}".format(file_path))
        raise exceptions.FileLoadError 
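A hedged usage sketch for the dispatcher above; the file names are hypothetical, and each load_format value routes to one of the loaders in the dictionary.

df_hdf = load_data('features.h5', load_format='hdf5', key='data')   # -> load_hdf5_data
df_csv = load_data('features.csv', load_format='csv')               # -> load_csv_data
obj    = load_data('features.pkl', load_format='pickle')            # -> load_pickle_data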
Example 40
Project: ArkPlanner   Author: ycremar   File: MaterialPlanning.py    License: MIT License
def load_data(path_stats, path_rules):
    """
    To load stats and rules data from local directories.
    Args:
        path_stats: string. local path to the stats data.
        path_rules: string. local path to the composing rules data.
    Returns:
        material_probs: dictionary. Content of the stats json file.
        convertion_rules: dictionary. Content of the rules json file.
    """
    with open(path_stats) as json_file:
        material_probs  = json.load(json_file)
    with open(path_rules) as json_file:
        convertion_rules  = json.load(json_file)

    return material_probs, convertion_rules 
Example 41
Project: keras-lambda   Author: sunilmallya   File: mnist.py    License: MIT License
def load_data(path='mnist.npz'):
    """Loads the MNIST dataset.

    # Arguments
        path: path where to cache the dataset locally
            (relative to ~/.keras/datasets).

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    path = get_file(path, origin='https://s3.amazonaws.com/img-datasets/mnist.npz')
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()
    return (x_train, y_train), (x_test, y_test) 
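Typical use of the Keras-style loader above; the shapes shown are the usual MNIST sizes.

(x_train, y_train), (x_test, y_test) = load_data()   # caches mnist.npz under ~/.keras/datasets
print(x_train.shape, y_train.shape)                  # (60000, 28, 28) (60000,)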
Example 42
Project: gokart   Author: m3dev   File: task.py    License: MIT License
def load_data_frame(self,
                        target: Union[None, str, TargetOnKart] = None,
                        required_columns: Optional[Set[str]] = None,
                        drop_columns: bool = False) -> pd.DataFrame:
        def _flatten_recursively(dfs):
            if isinstance(dfs, list):
                return pd.concat([_flatten_recursively(df) for df in dfs])
            else:
                return dfs
        data = _flatten_recursively(self.load(target=target))

        required_columns = required_columns or set()
        if data.empty and len(data.index) == 0:
            return pd.DataFrame(columns=required_columns)
        assert required_columns.issubset(set(data.columns)), f'data must have columns {required_columns}, but actually have only {data.columns}.'
        if drop_columns:
            data = data[required_columns]
        return data 
Example 43
Project: BiDAF   Author: jojonki   File: process_data.py    License: Apache License 2.0
def load_processed_data(fpath):
    ctx_max_len = 0 # character level length
    with open(fpath) as f:
        lines = f.readlines()
        data = []
        for l in lines:
            c_label, c, q, a, a_txt = l.rstrip().split('\t')
            if len(c) > ctx_max_len:
                ctx_max_len = len(c)
            c, q, a = c.split(' '), q.split(' '), a.split(' ')
            # if len(c) > 30: continue # TMP
            c, q = lower_list(c), lower_list(q)
            cc = [list(w) for w in c]
            qc = [list(w) for w in q]
            a = [int(aa) for aa in a]
            a = [a[0], a[-1]]
            data.append((c_label, c, cc, q, qc, a, a_txt))
    return data, ctx_max_len 
Example 44
Project: SAR.AI   Author: Avikalp7   File: train.py    License: MIT License
def load_training_data(modified):
    """Loads saved training data from numpy matrix files

    :param modified: Boolean value to indicate if modified MLPH is used
    :return: training set features, test set features, training set target
        labels, test set target labels
    """

    file_name_1 = './output_data/X_train.npy' \
        if not modified else './output_data/modified/X_train.npy'
    file_name_2 = './output_data/y_train.npy' \
        if not modified else './output_data/modified/y_train.npy'

    assert (
        os.path.exists(file_name_1) and os.path.exists(file_name_2)), \
        'Error: run_mlph option off and no saved training data found.'

    X_train = np.load(file_name_1)
    y_train = np.load(file_name_2)
    y_train = list(y_train)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                        test_size=0.20,
                                                        random_state=42)

    return X_train, X_test, y_train, y_test 
Example 45
Project: figurefirst   Author: FlyRanch   File: regenerate.py    License: MIT License
def load_data_file(filename):
    if filename[-4:] == ".svg":
        data_filename = filename.split(".svg")[0] + "_data.dillpickle"
        print("Automatically finding data file: " + data_filename)
    else:
        data_filename = filename

    if os.path.exists(data_filename):
        f = open(data_filename, "rb")
        data = pickle.load(f)
        f.close()
    else:
        print("No data file: " + data_filename)
        data = None

    return data 
Example 46
Project: professional-services   Author: GoogleCloudPlatform   File: util.py    License: Apache License 2.0
def load_main_energy_data(project_id, gs_path):
  """Load main energy data from the specified file.

  Load main energy data from the specified file.

  Args:
    project_id: string, GCP project id.
    gs_path: string, path to the data file.
  Returns:
    pd.DataFrame, main energy data.
  """
  with gcs.open(gs_path) as f:
    data = pd.read_csv(f,
                       delimiter=' ',
                       header=None,
                       names=['time',
                              'main_watts',
                              'main_va',
                              'main_RMS'])
  data.time = data.time.apply(lambda x: datetime.fromtimestamp(x))
  data.set_index('time', drop=True, inplace=True)
  data.index = data.index.floor('S')
  return data 
Example 47
Project: multiNLI   Author: nyu-mll   File: data_processing.py    License: MIT License
def load_nli_data_genre(path, genre, snli=True):
    """
    Load a specific genre's examples from MultiNLI, or load SNLI data and assign a "snli" genre to the examples.
    If the "snli" parameter is set to True, a genre label of snli will be assigned to the data. If set to true, it will overwrite the genre label for MultiNLI data.
    """
    data = []
    j = 0
    with open(path) as f:
        for line in f:
            loaded_example = json.loads(line)
            if loaded_example["gold_label"] not in LABEL_MAP:
                continue
            loaded_example["label"] = LABEL_MAP[loaded_example["gold_label"]]
            if snli:
                loaded_example["genre"] = "snli"
            if loaded_example["genre"] == genre:
                data.append(loaded_example)
        random.seed(1)
        random.shuffle(data)
    return data 
Example 48
Project: hypers   Author: priyankshah7   File: _hsiDialog.py    License: BSD 3-Clause "New" or "Revised" License
def load_data(self):
        if self._X is None:
            self.slider.setEnabled(False)

        else:
            self.shape = self._X.shape
            self.dimensions = self._X.ndim
            # self.slider.setMaximum(self.shape[2]-1)

            if self.dimensions == 3:
                self.slider.setEnabled(False)
                self.data_image(self._X)
                self.data_spectrum(self._X)

            elif self.dimensions == 4:
                self.slider.setValue(0)
                self.slider.setMaximum(self.shape[2]-1)
                self.data_image(self._X[:, :, 0, :])
                self.data_spectrum(self._X[:, :, 0, :]) 
Example 49
Project: yolo_v2   Author: rky0930   File: wiki_data.py    License: Apache License 2.0
def load_annotated_data(self, in_file):
    self.annotated_examples = {}
    self.annotated_tables = {}
    f = tf.gfile.GFile(in_file, "r")
    counter = 0
    for line in f:
      if (counter > 0):
        line = line.strip()
        (question_id, utterance, context, target_value, tokens, lemma_tokens,
         pos_tags, ner_tags, ner_values, target_canon) = line.split("\t")
        question = self.pre_process_sentence(tokens, ner_tags, ner_values)
        target_canon = target_canon.split("|")
        self.annotated_examples[question_id] = WikiExample(
            question_id, question, target_canon, context)
        self.annotated_tables[context] = []
      counter += 1
    print "Annotated examples loaded ", len(self.annotated_examples)
    f.close() 
Example 50
Project: multiNLI   Author: nyu-mll   File: data_processing.py    License: MIT License
def load_nli_data(path, snli=False):
    """
    Load MultiNLI or SNLI data.
    If the "snli" parameter is set to True, a genre label of snli will be assigned to the data. 
    """
    data = []
    with open(path) as f:
        for line in f:
            loaded_example = json.loads(line)
            if loaded_example["gold_label"] not in LABEL_MAP:
                continue
            loaded_example["label"] = LABEL_MAP[loaded_example["gold_label"]]
            if snli:
                loaded_example["genre"] = "snli"
            data.append(loaded_example)
        random.seed(1)
        random.shuffle(data)
    return data 
Example 51
Project: autodeepnet   Author: autodeepnet   File: data_utils.py    License: MIT License
def load_hdf5_data(file_path, **kwargs):
    key = kwargs.get('key', None)
    pandas_format = kwargs.get('pandas_format', True)
    mode = kwargs.get('mode', 'r')
    logger.info("Opening HDF5 file {} to read...".format(file_path))
    try:
        if pandas_format:
            data = pd.read_hdf(file_path, key=key, mode=mode)
        else:
            with h5py.File(file_path, mode) as f:
                data = f[key][()]
    except KeyError as e:
        logger.exception("Dataset {} does not exist".format(dataset))
        raise exceptions.FileLoadError("Dataset does not exist")
    except Exception as e:
        logger.exception("Problem loading dataset: {0}".format(e))
        raise exceptions.FileLoadError
    logger.info("Successfully loaded HDF5 data")
    return data 
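A minimal sketch of calling the HDF5 loader above with hypothetical file and key names; pandas_format chooses between pandas.read_hdf and raw h5py access.

df  = load_hdf5_data('table.h5', key='df')                          # pandas-format HDF5 store
arr = load_hdf5_data('raw.h5', key='images', pandas_format=False)   # plain h5py dataset as an array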
Example 52
Project: d2l-zh   Author: d2l-ai   File: utils.py    License: Apache License 2.0
def load_data_pikachu(batch_size, edge_size=256):
    """Download the pikachu dataest and then load into memory."""
    data_dir = '../data/pikachu'
    _download_pikachu(data_dir)
    train_iter = image.ImageDetIter(
        path_imgrec=os.path.join(data_dir, 'train.rec'),
        path_imgidx=os.path.join(data_dir, 'train.idx'),
        batch_size=batch_size,
        data_shape=(3, edge_size, edge_size),
        shuffle=True,
        rand_crop=1,
        min_object_covered=0.95,
        max_attempts=200)
    val_iter = image.ImageDetIter(
        path_imgrec=os.path.join(data_dir, 'val.rec'),
        batch_size=batch_size,
        data_shape=(3, edge_size, edge_size),
        shuffle=False)
    return train_iter, val_iter 
Example 53
Project: CDSS   Author: HealthRex   File: STARROrderMedConversion.py    License: GNU General Public License v3.0
def loadRXCUIData(self):
        """Load up the full contents of the stride_mapped_meds table into
        memory (only a few thousand records) to facilitate rapid lookup resolution
        of common medication ingredient data.
        """
        rxcuiDataByMedId = dict()

        query = \
            """select medication_id, rxcui, active_ingredient
            from starr_datalake2018.mapped_meds
            """

        query_job = self.bqClient.queryBQ(query, verbose=True)

        for row in query_job:  # API request - fetches results
            (medId, rxcui, ingredient) = row    # Unpack the data tuple
            if medId not in rxcuiDataByMedId:
                rxcuiDataByMedId[medId] = dict()
            rxcuiDataByMedId[medId][rxcui] = ingredient

        return rxcuiDataByMedId 
Example 54
Project: pde-surrogate   Author: cics-nd   File: load.py    License: MIT License
def load_data(hdf5_file, ndata, batch_size, only_input=True, return_stats=False):
    with h5py.File(hdf5_file, 'r') as f:
        x_data = f['input'][:ndata]
        print(f'x_data: {x_data.shape}')    
        if not only_input:
            y_data = f['output'][:ndata]
            print(f'y_data: {y_data.shape}')    

    stats = {}
    if return_stats:
        y_variation = ((y_data - y_data.mean(0, keepdims=True)) ** 2).sum(
            axis=(0, 2, 3))
        stats['y_variation'] = y_variation
    
    data_tuple = (torch.FloatTensor(x_data), ) if only_input else (
            torch.FloatTensor(x_data), torch.FloatTensor(y_data))
    data_loader = DataLoader(TensorDataset(*data_tuple),
        batch_size=batch_size, shuffle=True, drop_last=True)
    print(f'Loaded dataset: {hdf5_file}')
    return data_loader, stats 
Example 55
Project: nlp_toolkit   Author: stevewyl   File: utilities.py    License: MIT License
def load_tc_data(fname, label_prefix='__label__', max_tokens_per_doc=256):

    def gen():
        with open(fname, 'r', encoding='utf8') as fin:
            for line in fin:
                words = line.strip().split()
                if words:
                    nb_labels = 0
                    label_line = []
                    for word in words:
                        if word.startswith(label_prefix):
                            nb_labels += 1
                            label = word.replace(label_prefix, "")
                            label_line.append(label)
                        else:
                            break
                    text = words[nb_labels:]
                    if len(text) > max_tokens_per_doc:
                        text = text[:max_tokens_per_doc]
                    yield (text, label_line)

    texts, labels = zip(*[item for item in gen()])
    return texts, labels 
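The loader above expects fastText-style lines in which one or more labels prefixed with __label__ come before the tokenized text; a sketch with hypothetical file contents follows.

# train.txt, one document per line, for example:
#   __label__sports this is a document about football
#   __label__tech __label__ai neural networks for text
texts, labels = load_tc_data('train.txt')
print(texts[0][:5], labels[0])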
Example 56
Project: cnn_finetune   Author: flyyufelix   File: load_cifar10.py    License: MIT License
def load_cifar10_data(img_rows, img_cols):

    # Load cifar10 training and validation sets
    (X_train, Y_train), (X_valid, Y_valid) = cifar10.load_data()

    # Resize training images
    if K.image_dim_ordering() == 'th':
        X_train = np.array([cv2.resize(img.transpose(1,2,0), (img_rows,img_cols)).transpose(2,0,1) for img in X_train[:nb_train_samples,:,:,:]])
        X_valid = np.array([cv2.resize(img.transpose(1,2,0), (img_rows,img_cols)).transpose(2,0,1) for img in X_valid[:nb_valid_samples,:,:,:]])
    else:
        X_train = np.array([cv2.resize(img, (img_rows,img_cols)) for img in X_train[:nb_train_samples,:,:,:]])
        X_valid = np.array([cv2.resize(img, (img_rows,img_cols)) for img in X_valid[:nb_valid_samples,:,:,:]])

    # Transform targets to keras compatible format
    Y_train = np_utils.to_categorical(Y_train[:nb_train_samples], num_classes)
    Y_valid = np_utils.to_categorical(Y_valid[:nb_valid_samples], num_classes)

    return X_train, Y_train, X_valid, Y_valid 
Example 57
Project: RecVAE   Author: ilya-shenbin   File: utils.py    License: Apache License 2.0
def load_tr_te_data(csv_file_tr, csv_file_te, n_items, n_users, global_indexing=False):
    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    if global_indexing:
        start_idx = 0
        end_idx = len(unique_uid) - 1
    else:
        start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
        end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())

    rows_tr, cols_tr = tp_tr['uid'] - start_idx, tp_tr['sid']
    rows_te, cols_te = tp_te['uid'] - start_idx, tp_te['sid']

    data_tr = sparse.csr_matrix((np.ones_like(rows_tr),
                             (rows_tr, cols_tr)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    data_te = sparse.csr_matrix((np.ones_like(rows_te),
                             (rows_te, cols_te)), dtype='float64', shape=(end_idx - start_idx + 1, n_items))
    return data_tr, data_te 
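The two CSV files above are expected to share a uid/sid schema; a small sketch with hypothetical paths and sizes follows.

# train_tr.csv and train_te.csv are expected to contain at least the columns: uid, sid
data_tr, data_te = load_tr_te_data('train_tr.csv', 'train_te.csv', n_items=1000, n_users=100)
print(data_tr.shape)   # (number of users in the split, n_items)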
Example 58
Project: document-ocr   Author: rockyzhengwu   File: book_data.py    License: Apache License 2.0
def load_label_data(label_file_path):
  tree = ET.parse(label_file_path)  
  root = tree.getroot()
  label_data = {}
  image_list= root.findall('image')
  text_list= root.findall('text')
  label_data['image_path']=root.get("image_path")
  page_height = int(root.get('height'))
  page_width = int(root.get('width'))
  label_data['width'] =  page_width
  label_data['height'] = page_height
  label_data['images'] = []
  label_data['texts'] = []
  for image in image_list:
    box = get_item_box(image)
    label_data['images'].append(box)
  for text in text_list:
    box = get_item_box(text)
    label_data['texts'].append(box)
  return label_data 
Example 59
Project: TransDG   Author: siat-nlp   File: data_loader.py    License: MIT License
def load_data(data_dir, is_train=False):
    data_train, data_dev, data_test = [], [], []
    if is_train:
        with open('%s/train.txt' % data_dir) as f:
            for idx, line in enumerate(f):
                data_train.append(json.loads(line))
                if idx > 0 and idx % 100000 == 0:
                    print('read train file line %d' % idx)
        with open('%s/valid.txt' % data_dir) as f:
            for line in f:
                data_dev.append(json.loads(line))
    else:
        with open('%s/test.txt' % data_dir) as f:
            for line in f:
                data_test.append(json.loads(line))
    if is_train:
        return data_train, data_dev
    else:
        return data_test 
Example 60
Project: MyTwitterBot   Author: felipessalvatore   File: DataHolder.py    License: MIT License
def load_data(self, debug):
        """
        Loads starter word-vectors and train/dev/test data.

        :type debug: boolean
        """
        self.vocab = Vocab()
        self.vocab.read_words(self.read_line_eos_noums(self.path_train))
        self.encoded_train = np.array(
            [self.vocab.encode(word)
             for word in self.read_line_eos_noums(self.path_train)],
            dtype=np.int32)
        self.encoded_valid = np.array(
            [self.vocab.encode(word)
             for word in self.read_line_eos_noums(self.path_valid)],
            dtype=np.int32)
        self.encoded_test = np.array(
            [self.vocab.encode(word)
             for word in self.read_line_eos_noums(self.path_test)],
            dtype=np.int32)
        if debug:
            num_debug = 1024
            self.encoded_train = self.encoded_train[:num_debug]
            self.encoded_valid = self.encoded_valid[:num_debug]
            self.encoded_test = self.encoded_test[:num_debug]