Python faiss.IndexFlatIP() Examples

The following are 15 code examples of faiss.IndexFlatIP(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module faiss , or try the search function .
Example #1
Source File: faiss.py    From gntp with MIT License 7 votes vote down vote up
def _build_approximate_index(self,
                                     data: np.ndarray):
            dimensionality = data.shape[1]
            nlist = 100 if data.shape[0] > 100 else 2

            if self.kernel_name in {'rbf'}:
                quantizer = faiss.IndexFlatL2(dimensionality)
                cpu_index_flat = faiss.IndexIVFFlat(quantizer, dimensionality, nlist, faiss.METRIC_L2)
            else:
                quantizer = faiss.IndexFlatIP(dimensionality)
                cpu_index_flat = faiss.IndexIVFFlat(quantizer, dimensionality, nlist)

            gpu_index_ivf = faiss.index_cpu_to_gpu(self.resource, 0, cpu_index_flat)
            gpu_index_ivf.train(data)
            gpu_index_ivf.add(data)
            self.index = gpu_index_ivf 
Example #2
Source File: h5_to_faiss.py    From Seq2Seq-Vis with Apache License 2.0 6 votes vote down vote up
def main():
    f = h5py.File(opt.states, "r")
    data = f[opt.data]
    seqs, slens, hid = data.shape

    print("Processing {} Sequences".format(seqs))
    print("with {} tokens each".format(slens))
    print("and {} states".format(hid))

    # Initialize a new index
    index = faiss.IndexFlatIP(hid)
    # Fill it
    for ix in tqdm(range(0, seqs-opt.stepsize, opt.stepsize)):
        cdata = np.array(data[ix:ix+opt.stepsize]\
                  .reshape(-1, hid), dtype="float32")
        index.add(cdata)
    f.close()

    faiss.write_index(index, opt.output) 
Example #3
Source File: indexing.py    From faiss-server with MIT License 5 votes vote down vote up
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH) 
Example #4
Source File: phrase_embed.py    From diora with Apache License 2.0 5 votes vote down vote up
def __init__(self, dim=None):
        super(Index, self).__init__()
        self.D, self.I = None, None
        self.index = faiss.IndexFlatIP(dim) 
Example #5
Source File: faiss_indexer.py    From BLINK with MIT License 5 votes vote down vote up
def __init__(self, vector_sz: int = 1, buffer_size: int = 50000):
        super(DenseFlatIndexer, self).__init__(buffer_size=buffer_size)
        self.index = faiss.IndexFlatIP(vector_sz) 
Example #6
Source File: utils.py    From DeMa-BWE with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def get_nn_avg_dist(emb, query, knn):
    """
    Compute the average distance of the `knn` nearest neighbors
    for a given set of embeddings and queries.
    Use Faiss if available.
    """
    if FAISS_AVAILABLE:
        emb = emb.cpu().numpy()
        query = query.cpu().numpy()
        if hasattr(faiss, 'StandardGpuResources'):
            # gpu mode
            res = faiss.StandardGpuResources()
            config = faiss.GpuIndexFlatConfig()
            config.device = 0
            index = faiss.GpuIndexFlatIP(res, emb.shape[1], config)
        else:
            # cpu mode
            index = faiss.IndexFlatIP(emb.shape[1])
        index.add(emb)
        distances, _ = index.search(query, knn)
        return distances.mean(1)
    else:
        bs = 1024
        all_distances = []
        emb = emb.transpose(0, 1).contiguous()
        for i in range(0, query.shape[0], bs):
            distances = query[i:i + bs].mm(emb)
            best_distances, _ = distances.topk(knn, dim=1, largest=True, sorted=True)
            all_distances.append(best_distances.mean(1).cpu())
        all_distances = torch.cat(all_distances)
        return all_distances.numpy() 
Example #7
Source File: create_faiss.py    From exbert with Apache License 2.0 5 votes vote down vote up
def train_indexes(ce:CorpusDataWrapper, stepsize=100, drop_null=True):
    """

    Parameters:
    ===========
    - corpus_embedding: Wrapper around HDF5 file for easy access to data
    - stepsize: How many sentences to train with at once
    - drop_null: Don't index the embeddings of special tokens (e.g., [CLS] and [SEP]) whose spacy POS are null
    """
    NUM_LAYERS = ce.n_layers # want to account for the input layer, which for attentions + contexts is all value 0
    
    embedding_indexes = [faiss.IndexFlatIP(ce.embedding_dim) for i in range(NUM_LAYERS)]
    context_indexes = [faiss.IndexFlatIP(ce.embedding_dim) for i in range(NUM_LAYERS)]

    for ix in range(0, len(ce), stepsize):
        cdata = ce[ix:ix+stepsize]

        if drop_null: 
            embeddings = np.concatenate([c.zero_special_embeddings for c in cdata], axis=1)
            contexts = np.concatenate([c.zero_special_contexts for c in cdata], axis=1)
        else:
            embeddings = np.concatenate([c.embeddings for c in cdata], axis=1)
            contexts = np.concatenate([c.contexts for c in cdata], axis=1)

        for i in range(NUM_LAYERS):
            embedding_indexes[i].add(embeddings[i])
            context_indexes[i].add(contexts[i])
            
    return embedding_indexes, context_indexes 
Example #8
Source File: submit_retrieval.py    From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0 5 votes vote down vote up
def predict_landmark_id(ids_query, feats_query, ids_train, feats_train, landmark_dict, voting_k=3):
    print('build index...')
    cpu_index = faiss.IndexFlatIP(feats_train.shape[1])
    cpu_index.add(feats_train)
    sims, topk_idx = cpu_index.search(x=feats_query, k=voting_k)
    print('query search done.')

    df = pd.DataFrame(ids_query, columns=['id'])
    df['images'] = np.apply_along_axis(' '.join, axis=1, arr=ids_train[topk_idx])

    rows = []
    for imidx, (_, r) in tqdm.tqdm(enumerate(df.iterrows()), total=len(df)):
        image_ids = [name.split('/')[-1] for name in r.images.split(' ')]
        counter = Counter()
        for i, image_id in enumerate(image_ids[:voting_k]):
            landmark_id = landmark_dict[image_id]

            counter[landmark_id] += sims[imidx, i]

        landmark_id, score = counter.most_common(1)[0]
        rows.append({
            'id': r['id'],
            'landmarks': f'{landmark_id} {score:.9f}',
        })

    pred = pd.DataFrame(rows).set_index('id')
    pred['landmark_id'], pred['score'] = list(
        zip(*pred['landmarks'].apply(lambda x: str(x).split(' '))))
    pred['score'] = pred['score'].astype(np.float32) / voting_k

    return pred 
Example #9
Source File: reranking.py    From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0 5 votes vote down vote up
def __init__(self, database, method):
        super().__init__(database, method)
        self.index = {'cosine': faiss.IndexFlatIP,
                      'euclidean': faiss.IndexFlatL2}[method](self.D)
        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            print('CUDA', os.environ.get('CUDA_VISIBLE_DEVICES'))
            self.index = faiss.index_cpu_to_all_gpus(self.index)
        self.add() 
Example #10
Source File: reranking.py    From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0 5 votes vote down vote up
def __init__(self, database, method, M=128, nbits=8, nlist=316, nprobe=32):
        super().__init__(database, method)
        self.quantizer = {'cosine': faiss.IndexFlatIP,
                          'euclidean': faiss.IndexFlatL2}[method](self.D)
        self.index = faiss.IndexIVFPQ(self.quantizer, self.D, nlist, M, nbits)
        samples = database[np.random.permutation(np.arange(self.N))[:self.N]]
        print("[ANN] train")
        self.index.train(samples)
        self.add()
        self.index.nprobe = nprobe 
Example #11
Source File: faiss.py    From gntp with MIT License 5 votes vote down vote up
def _build_exact_index(self,
                               data: np.ndarray):
            dimensionality = data.shape[1]

            if self.kernel_name in {'rbf'}:
                self.cpu_index_flat = faiss.IndexFlatL2(dimensionality)
            else:
                self.cpu_index_flat = faiss.IndexFlatIP(dimensionality)

            if not self.cpu:
                self.index = faiss.index_cpu_to_gpu(self.resource, 0, self.cpu_index_flat)
            else:
                self.index = self.cpu_index_flat
            self.index.add(data) 
Example #12
Source File: streamlit_demo.py    From RecNN with Apache License 2.0 5 votes vote down vote up
def get_index():
    import faiss
    from sklearn.preprocessing import normalize
    # test indexes
    indexL2 = faiss.IndexFlatL2(128)
    indexIP = faiss.IndexFlatIP(128)
    indexCOS = faiss.IndexFlatIP(128)

    mov_mat, _, _ = get_embeddings()
    mov_mat = mov_mat.numpy().astype('float32')
    indexL2.add(mov_mat)
    indexIP.add(mov_mat)
    indexCOS.add(normalize(mov_mat, axis=1, norm='l2'))
    return {'L2': indexL2, 'IP': indexIP, 'COS': indexCOS} 
Example #13
Source File: knn.py    From diffusion with MIT License 5 votes vote down vote up
def __init__(self, database, method):
        super().__init__(database, method)
        self.index = {'cosine': faiss.IndexFlatIP,
                      'euclidean': faiss.IndexFlatL2}[method](self.D)
        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            self.index = faiss.index_cpu_to_all_gpus(self.index)
        self.add() 
Example #14
Source File: knn.py    From diffusion with MIT License 5 votes vote down vote up
def __init__(self, database, method, M=128, nbits=8, nlist=316, nprobe=64):
        super().__init__(database, method)
        self.quantizer = {'cosine': faiss.IndexFlatIP,
                          'euclidean': faiss.IndexFlatL2}[method](self.D)
        self.index = faiss.IndexIVFPQ(self.quantizer, self.D, nlist, M, nbits)
        samples = database[np.random.permutation(np.arange(self.N))[:self.N // 5]]
        print("[ANN] train")
        self.index.train(samples)
        self.add()
        self.index.nprobe = nprobe 
Example #15
Source File: knn.py    From learn-to-cluster with MIT License 4 votes vote down vote up
def __init__(self,
                 feats,
                 k,
                 index_path='',
                 index_key='',
                 nprobe=128,
                 omp_num_threads=None,
                 rebuild_index=True,
                 verbose=True,
                 **kwargs):
        import faiss
        if omp_num_threads is not None:
            faiss.omp_set_num_threads(omp_num_threads)
        self.verbose = verbose
        with Timer('[faiss] build index', verbose):
            if index_path != '' and not rebuild_index and os.path.exists(
                    index_path):
                print('[faiss] read index from {}'.format(index_path))
                index = faiss.read_index(index_path)
            else:
                feats = feats.astype('float32')
                size, dim = feats.shape
                index = faiss.IndexFlatIP(dim)
                if index_key != '':
                    assert index_key.find(
                        'HNSW') < 0, 'HNSW returns distances insted of sims'
                    metric = faiss.METRIC_INNER_PRODUCT
                    nlist = min(4096, 8 * round(math.sqrt(size)))
                    if index_key == 'IVF':
                        quantizer = index
                        index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                                                   metric)
                    else:
                        index = faiss.index_factory(dim, index_key, metric)
                    if index_key.find('Flat') < 0:
                        assert not index.is_trained
                    index.train(feats)
                    index.nprobe = min(nprobe, nlist)
                    assert index.is_trained
                    print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
                index.add(feats)
                if index_path != '':
                    print('[faiss] save index to {}'.format(index_path))
                    mkdir_if_no_exists(index_path)
                    faiss.write_index(index, index_path)
        with Timer('[faiss] query topk {}'.format(k), verbose):
            knn_ofn = index_path + '.npz'
            if os.path.exists(knn_ofn):
                print('[faiss] read knns from {}'.format(knn_ofn))
                self.knns = np.load(knn_ofn)['data']
            else:
                sims, nbrs = index.search(feats, k=k)
                self.knns = [(np.array(nbr, dtype=np.int32),
                              1 - np.array(sim, dtype=np.float32))
                             for nbr, sim in zip(nbrs, sims)]