Python keras.preprocessing Examples

The following code examples show how to use the keras.preprocessing module. They are taken from open-source Python projects.
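
keras.preprocessing is a module rather than a callable; for orientation, here is a minimal sketch of typical usage of its image submodule (the file path is hypothetical):

from keras.preprocessing import image
import numpy as np

img = image.load_img('example.jpg', target_size=(224, 224))   # load and resize an image
x = image.img_to_array(img)                                    # convert to a float array (H, W, C)
x = np.expand_dims(x, axis=0)                                   # add a batch dimension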

Example 1
Project: Sacred_Deep_Learning   Author: AAbercrombie0492   File: evaluate_model.py    GNU General Public License v3.0
def test_images_generator(test_path):
    '''
    Creates a generator that pulls images from a test directory that contains
    shade vs sunny subdirectories.
    '''
    from keras.utils.np_utils import to_categorical
    from keras.preprocessing import image
    from keras.preprocessing.image import ImageDataGenerator
    from keras.applications.resnet50 import preprocess_input
    from sklearn.model_selection import train_test_split
    from image_utilities import load_images_from_directory, preprocess_input_resnet
    import numpy as np

    # load images from the test directory (shade/ and sunny/ subdirectories)
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input_resnet)
    test_generator = test_datagen.flow_from_directory(directory=test_path,
                                                target_size=[224, 224],
                                                batch_size=26,
                                                class_mode='categorical')

    return test_datagen, test_generator 
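
A hedged usage sketch for the function above (model and the 'data/test' path are hypothetical; assumes a compiled classifier with 224x224 inputs and accuracy as a metric):

test_datagen, test_generator = test_images_generator('data/test')
loss, acc = model.evaluate_generator(test_generator, steps=len(test_generator))
print('Test accuracy: {:.3f}'.format(acc))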
Example 2
Project: GraphicDesignPatternByPython   Author: Relph1119   File: autogen.py    MIT License
def clean_module_name(name):
    if name.startswith('keras_applications'):
        name = name.replace('keras_applications', 'keras.applications')
    if name.startswith('keras_preprocessing'):
        name = name.replace('keras_preprocessing', 'keras.preprocessing')
    assert name[:6] == 'keras.', 'Invalid module name: %s' % name
    return name 
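
For illustration, the mapping this performs (outputs shown as comments):

clean_module_name('keras_applications.resnet50')   # -> 'keras.applications.resnet50'
clean_module_name('keras_preprocessing.image')     # -> 'keras.preprocessing.image'
clean_module_name('keras.preprocessing.text')      # -> 'keras.preprocessing.text' (already valid)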
Example 3
Project: keylogger   Author: sameetandpotatoes   File: user.py    MIT License
def set_tags(self):
        """
            Run a convolutional neural network on the server-side per image
            to associate tags (doesn't take more than a few seconds), and
            runs in parallel since thread is started per client request
        """
        import base64
        import cv2
        import numpy as np
        import keras
        from keras.preprocessing import image
        from keras_squeezenet import SqueezeNet
        from keras.applications.imagenet_utils import preprocess_input, decode_predictions
        # dtype must be uint8
        # Decode the base64-encoded image (encode to ASCII bytes for Python 3 compatibility)
        server_buffer = np.frombuffer(base64.decodebytes(self.image.encode('ascii')), dtype="uint8")
        server_frame = cv2.imdecode(server_buffer, cv2.IMREAD_UNCHANGED)

        keras.backend.clear_session()
        model = SqueezeNet()
        resized = cv2.resize(server_frame, dsize=(227, 227), interpolation=cv2.INTER_CUBIC)
        x = image.img_to_array(resized)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)

        preds = decode_predictions(model.predict(x))
        # preds is a list with one list of (class_id, class_name, score) tuples per sample
        print("Tags: {}".format(preds[0]))
        for _, pred, score in preds[0]:
            self.tags.append(pred) 
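
decode_predictions returns one list of (class_id, class_name, score) tuples per input sample, so preds[0] holds the top ImageNet guesses for the single decoded frame. The values below are illustrative only:

# preds[0] might look like:
# [('n02123045', 'tabby', 0.62), ('n02123159', 'tiger_cat', 0.21), ('n02124075', 'Egyptian_cat', 0.08)]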
Example 4
Project: ecir2019-qac   Author: jantrienes   File: baseline_cnn_cv.py    MIT License
def preprocess(docs):
    padded = []
    for doc in docs:
        padded.append(preprocessing.pad_sequence(doc, max_length=SEQUENCE_LENGTH))
    return padded 
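
Here `preprocessing` appears to be the project's own module, not keras.preprocessing. A rough equivalent with the standard Keras utility, as a sketch (the SEQUENCE_LENGTH value is assumed; the real constant is defined elsewhere in the project):

from keras.preprocessing.sequence import pad_sequences

SEQUENCE_LENGTH = 100  # assumed for illustration
docs = [[4, 8, 15], [16, 23, 42, 7]]
padded = pad_sequences(docs, maxlen=SEQUENCE_LENGTH, padding='post', truncating='post')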
Example 5
Project: ecir2019-qac   Author: jantrienes   File: baseline_cnn_cv.py    MIT License
def evaluate_model(run_dir, model, X, y_true, config, embedding_weights, set_name):
    gen = CNNStaticGenerator(X, y_true, config['batch_size'], embedding_weights, return_y=False)

    y_pred_proba = model.predict_generator(gen, use_multiprocessing=MULTIPROCESSING,
                                           workers=WORKERS, max_queue_size=MAX_QUEUE_SIZE)[:, 0]
    y_pred = (y_pred_proba >= 0.5).astype(int)
    class_names = preprocessing.LABEL_ENCODER.classes_
    scores = evaluation._evaluate(y_true, y_pred, y_pred_proba, class_names)
    file_name = '{}_results.csv'.format(set_name)
    pd.DataFrame(scores, index=[0]).to_csv(join(run_dir, file_name), index=False)
    return y_pred, y_pred_proba 
Example 6
Project: sears   Author: marcotcr   File: fasttext.py    BSD 2-Clause "Simplified" License
def fit(self, X, Y, ngram_range=1, max_features=20000, maxlen=400,
            batch_size=32, embedding_dims=50, epochs=5):
        self.tokenizer = keras.preprocessing.text.Tokenizer(
            num_words=max_features, split=" ", char_level=False)
        self.tokenizer.fit_on_texts(X)
        x_train = self.tokenizer.texts_to_sequences(X)
        self.ngram_range = ngram_range
        self.maxlen = maxlen
        self.add_ngrams = lambda x: x
        if ngram_range > 1:
            ngram_set = set()
            for input_list in x_train:
                for i in range(2, ngram_range + 1):
                    set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Dictionary mapping n-gram token to a unique integer.
            # Integer values are greater than max_features in order
            # to avoid collision with existing features.
            start_index = max_features + 1
            self.token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
            indice_token = {self.token_indice[k]: k for k in self.token_indice}

            # max_features is the highest integer that could be found in the dataset.
            max_features = np.max(list(indice_token.keys())) + 1
            self.add_ngrams = lambda x: add_ngram(x, self.token_indice,
                                                  self.ngram_range)
            x_train = self.add_ngrams(x_train)
            print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
        x_train = sequence.pad_sequences(x_train, maxlen=self.maxlen)
        self.model = Sequential()

        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        self.model.add(Embedding(max_features,
                                 embedding_dims,
                                 input_length=self.maxlen))

        # we add a GlobalAveragePooling1D, which will average the embeddings
        # of all words in the document
        self.model.add(GlobalAveragePooling1D())

        # We project onto a single unit output layer, and squash via sigmoid:
        self.model.add(Dense(1, activation='sigmoid'))

        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        self.model.fit(x_train, Y, batch_size=batch_size, epochs=epochs, verbose=2) 
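
The create_ngram_set and add_ngram helpers are not shown in this excerpt; a sketch consistent with the Keras fastText example this code appears to follow:

def create_ngram_set(input_list, ngram_value=2):
    # All contiguous n-grams of length ngram_value, as a set of tuples.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    # Append the integer index of every known n-gram (length 2..ngram_range) to each sequence.
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences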
Example 7
Project: sears   Author: marcotcr   File: fasttext.py    BSD 2-Clause "Simplified" License
def fit(self, X, Y, max_features=20000, maxlen=400,
            batch_size=32, hidden_dims=250, filters=250, kernel_size=3,
            epochs=5):
        from keras.preprocessing import sequence
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation
        from keras.layers import Embedding
        from keras.layers import Conv1D, GlobalMaxPooling1D
        self.tokenizer = keras.preprocessing.text.Tokenizer(
            num_words=max_features, split=" ", char_level=False)
        self.tokenizer.fit_on_texts(X)
        x_train = self.tokenizer.texts_to_sequences(X)
        self.maxlen = maxlen
        embeddings = get_most_common_embeddings(self.tokenizer, self.nlp)
        x_train = sequence.pad_sequences(x_train, maxlen=self.maxlen)
        self.model = Sequential()
        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        self.model.add(
            Embedding(
                embeddings.shape[0],
                embeddings.shape[1],
                input_length=maxlen,
                trainable=False,
                weights=[embeddings]
            )
        )

        self.model.add(Dropout(0.2))

        # we add a Convolution1D, which will learn filters
        # word group filters of size filter_length:
        self.model.add(Conv1D(filters, kernel_size, padding='valid',
                              activation='relu', strides=1))
        # we use max pooling:
        self.model.add(GlobalMaxPooling1D())

        # We add a vanilla hidden layer:
        self.model.add(Dense(hidden_dims))
        self.model.add(Dropout(0.2))
        self.model.add(Activation('relu'))
        # We project onto a single unit output layer, and squash it with a sigmoid:
        self.model.add(Dense(1))
        # model.add(Dense(3))
        self.model.add(Activation('sigmoid'))

        # optimizer = keras.optimizers.Adam(lr=0.001)
        optimizer = keras.optimizers.Adam(lr=0.0001)
        # model.compile(loss='categorical_crossentropy',
        #               optimizer=optimizer,
        #               metrics=['accuracy'])
        self.model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])

        self.model.fit(x_train, Y, batch_size=batch_size, epochs=epochs, verbose=2) 
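
A hedged prediction sketch for the same class (new_texts is hypothetical; assumes fit() has already been called):

from keras.preprocessing import sequence

x_new = self.tokenizer.texts_to_sequences(new_texts)
x_new = sequence.pad_sequences(x_new, maxlen=self.maxlen)
probs = self.model.predict(x_new)[:, 0]      # sigmoid outputs in [0, 1]
labels = (probs >= 0.5).astype(int)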
Example 8
Project: ecir2019-qac   Author: jantrienes   File: baseline_cnn_cv.py    MIT License
def load_data(community_name):
    dataset = namedtuple('dataset', ['X_train', 'y_train', 'train_ids', 'X_val', 'y_val',
                                     'val_ids', 'X_test', 'y_test', 'test_ids', 'vocabulary_inv'])

    if community_name == 'CNN_DEBUG':
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=SEQUENCE_LENGTH,
                                         padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=SEQUENCE_LENGTH,
                                        padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"

        train_ids = np.array(range(0, 10))
        val_ids = np.array(range(10, 20))
        test_ids = np.array(range(20, 30))
        return dataset(x_train[:10], y_train[:10], train_ids,
                       x_test[:10], y_test[:10], val_ids,
                       x_test[10:20], y_test[10:20], test_ids,
                       vocabulary_inv)

    X_train, y_train, train_ids, X_val, y_val, val_ids, X_test, y_test, test_ids = preprocessing \
        .load_community(community_name, preprocess=True, min_df=3, with_dev=True)

    logger.info('Pad documents...')
    X_train = preprocess(X_train)
    X_val = preprocess(X_val)
    X_test = preprocess(X_test)

    logger.info('Build vocabulary...')
    vocabulary, vocabulary_inv = preprocessing.build_vocab(X_train + X_val + X_test)
    logger.info('Map vocabulary...')
    X_train = preprocessing.map_vocabulary(X_train, vocabulary)
    X_val = preprocessing.map_vocabulary(X_val, vocabulary)
    X_test = preprocessing.map_vocabulary(X_test, vocabulary)

    return dataset(X_train, y_train, train_ids,
                   X_val, y_val, val_ids,
                   X_test, y_test, test_ids,
                   vocabulary_inv) 
Example 9
Project: ecir2019-qac   Author: jantrienes   File: baseline_cnn_cv.py    MIT License
def main(args, metadata_dir, execution_id):
    cnn_util.save_config(metadata_dir, PREPROCESSING_PARAMS, 'preprocessing-params')

    # Data Preparation
    logger.info("Load data...")
    dataset = load_data(args.community)
    X_train, X_val, X_test = dataset.X_train, dataset.X_val, dataset.X_test
    vocabulary_inv = dataset.vocabulary_inv

    logger.info("X_train shape: %s", X_train.shape)
    logger.info("X_val shape: %s", X_val.shape)
    logger.info("X_test shape: %s", X_test.shape)
    logger.info("Vocabulary Size: %d", len(vocabulary_inv))

    embedding_weights = w2v.train_word2vec(np.vstack((X_train, X_val, X_test)), vocabulary_inv,
                                           args.community, num_features=EMBEDDING_DIM,
                                           min_word_count=MIN_WORD_COUNT, context=CONTEXT)

    grid = list(ParameterGrid(MODEL_PARAMS))
    run_names = ['run_{}'.format(i) for i in range(len(grid))]

    failed_runs = []
    logger.info('Start testing "%d" configurations', len(grid))
    for config, run_name in zip(grid, run_names):
        try:
            logger.info('%s...', run_name)
            run_dir = join(metadata_dir, run_name)
            os.makedirs(run_dir, exist_ok=True)
            cnn_util.save_config(run_dir, config)

            make_run(run_dir, config, dataset, embedding_weights, run_name)
        except Exception:
            logger.exception('Exception during run %s', run_name)
            failed_runs.append(run_name)

    if failed_runs:
        logger.warning('Failed runs: %s', str(failed_runs))
        run_names = [run for run in run_names if run not in failed_runs]

    if run_names:
        cnn_util.consolidate_runs(args, metadata_dir, run_names, execution_id)
    else:
        logger.error('All runs failed!') 
Example 10
Project: Tegu   Author: generalized-intelligence   File: API.py    BSD 3-Clause "New" or "Revised" License
def serval_to_dict(self, anno_path, data_path, max_box=128):
        '''Extract annotation info and do preprocessing'''
        contents = ''
        self.gt_boxes = {}      # for metrics
        
        _, contents = decrypt_txt_file(anno_path)
        lines = contents.split('\n')
        self.class_count = len(lines[1].split(','))
        self.class_names = lines[1].split(',')
        self.class_names = [cn.split(':')[1] for cn in self.class_names]

        gt = {}
        annos = lines[2:]
        for anno in annos:
            self.gt_boxes_per_img = np.zeros((max_box, 5), dtype='int32')     # for metrics
            x = anno.split(':')
            if len(x) == 1 or x[1] == '':
                continue
            name = x[0]
            img = cv2.imread(os.path.join(data_path,name))
            try:
                # cv2 images are indexed (height, width, channels)
                height = img.shape[0]
                width = img.shape[1]
            except AttributeError:      # cv2.imread returns None for unreadable files
                continue
            x = list(map(lambda a:int(float(a)), x[1].split(',')))
            i=0
            boxes = []                        
            while len(x)>0 and i<max_box:
                box = []
                self.gt_boxes_per_img[i] = [x[1],x[2],x[3],x[4],x[0]]       #for metrics
                label = np.zeros(self.class_count)
                label[x[0]] = float(1)
                xmin = float(x[1])/width
                ymin = float(x[2])/height
                xmax = float(x[3])/width
                ymax = float(x[4])/height
                box = [xmin, ymin, xmax, ymax]
                box.extend(label[1:])
                boxes.append(box)
                x = x[5:]
                i+=1
            self.gt_boxes[name] = self.gt_boxes_per_img     #for metrics
            boxes = np.asarray(boxes,dtype='float64')
            gt[name] = boxes
        return gt     #{'name':[[x1,y1,x2,y2,0,0,1]]}
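
Given the construction above, each entry pairs normalised box coordinates with the one-hot class vector minus its first column; an illustrative entry for one box of class 2 in a 3-class set (file name and numbers are made up):

# gt['frame_001.jpg'] might hold:
# array([[0.10, 0.25, 0.40, 0.75, 0., 1.]])   # [xmin, ymin, xmax, ymax] + label[1:]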