Python faiss.read_index() Examples

The following are 14 code examples of faiss.read_index(), collected from open-source projects. The project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the faiss module.
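As a quick orientation, faiss.read_index() deserializes an index that was previously written with faiss.write_index(). A minimal round trip, with an arbitrary dimension and file name chosen for illustration:

import faiss
import numpy as np

d = 64                                    # vector dimension (arbitrary)
xb = np.random.rand(1000, d).astype('float32')

index = faiss.IndexFlatL2(d)              # exact L2 index
index.add(xb)
faiss.write_index(index, 'example.index')

index2 = faiss.read_index('example.index')
assert index2.ntotal == index.ntotal      # same number of stored vectors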
Example #1
Source File: _faiss.py    From mars with Apache License 2.0
def _load_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)

    if return_index_type == 'object':
        # local: `index` is already a faiss index object
        return index
    elif return_index_type == 'filename':
        # local cluster: `index` is the path of an index file
        return faiss.read_index(index)
    else:
        # distributed: `index` holds the serialized index bytes;
        # write them to a temp file so faiss can read them back
        # (keep the descriptor from mkstemp instead of leaking it)
        fd, fn = tempfile.mkstemp('.index', prefix='faiss_')
        with os.fdopen(fd, 'wb') as f:
            f.write(index)
        index = faiss.read_index(fn)
        if device_id >= 0:  # pragma: no cover
            index = _index_to_gpu(index, device_id)
        return index
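When the serialized bytes are already in memory, the temp-file detour above can be avoided: recent faiss builds expose faiss.serialize_index() and faiss.deserialize_index(). A minimal sketch, assuming a faiss version that ships these helpers:

import faiss
import numpy as np

d = 32
index = faiss.IndexFlatL2(d)
index.add(np.random.rand(10, d).astype('float32'))

buf = faiss.serialize_index(index)       # uint8 numpy array
restored = faiss.deserialize_index(buf)  # no temp file needed
assert restored.ntotal == index.ntotal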
Example #2
Source File: mips.py    From denspi with Apache License 2.0
def __init__(self, phrase_dump_dir, start_index_path, idx2id_path, max_answer_length, para=False,
                 num_dummy_zeros=0, cuda=False):
        if os.path.isdir(phrase_dump_dir):
            self.phrase_dump_paths = sorted(
                [os.path.join(phrase_dump_dir, name) for name in os.listdir(phrase_dump_dir) if 'hdf5' in name])
            dump_names = [os.path.splitext(os.path.basename(path))[0] for path in self.phrase_dump_paths]
            self.dump_ranges = [list(map(int, name.split('-'))) for name in dump_names]
        else:
            self.phrase_dump_paths = [phrase_dump_dir]
        self.phrase_dumps = [h5py.File(path, 'r') for path in self.phrase_dump_paths]
        self.max_answer_length = max_answer_length
        self.para = para

        print('reading %s' % start_index_path)
        self.start_index = faiss.read_index(start_index_path, faiss.IO_FLAG_ONDISK_SAME_DIR)
        self.idx_f = self.load_idx_f(idx2id_path)
        self.has_offset = 'doc' not in self.idx_f
        # with h5py.File(idx2id_path, 'r') as f:
        #     self.idx2doc_id = f['doc'][:]
        #     self.idx2para_id = f['para'][:]
        #     self.idx2word_id = f['word'][:]

        self.num_dummy_zeros = num_dummy_zeros
        self.cuda = cuda 
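The IO_FLAG_ONDISK_SAME_DIR flag matters for IVF indexes whose inverted lists live in a separate .ivfdata file (built as in Example #8 below): it makes read_index resolve that file relative to the index file's own directory rather than the current working directory. A usage sketch with a placeholder path:

import faiss

# index.faiss references a sibling .ivfdata file; the flag resolves
# that reference relative to the index file's directory
index = faiss.read_index('/data/index.faiss', faiss.IO_FLAG_ONDISK_SAME_DIR)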
Example #3
Source File: run_index.py    From denspi with Apache License 2.0
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        # the fourth argument is the scalar quantizer type (8-bit to match
        # 'SQ8'); the metric comes after it
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)
    faiss.write_index(trained_index, trained_index_path) 
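The quantizer_path here is expected to hold a flat index whose stored vectors are the coarse centroids, which is why quantizer.ntotal doubles as the nlist argument. A minimal sketch of producing such a quantizer (dimensions and cluster count are arbitrary):

import faiss
import numpy as np

d, nlist = 64, 256
xt = np.random.rand(10000, d).astype('float32')

kmeans = faiss.Kmeans(d, nlist)      # trains the coarse centroids
kmeans.train(xt)

quantizer = faiss.IndexFlatL2(d)
quantizer.add(kmeans.centroids)      # ntotal == nlist
faiss.write_index(quantizer, 'quantizer.faiss')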
Example #4
Source File: indexing.py    From faiss-server with MIT License
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)  # pre-normalized word vectors
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH) 
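Because the vectors added above (syn0norm) are L2-normalized, the inner-product index effectively performs cosine-similarity search. The same idea without gensim, as a minimal sketch where the normalization step is the key assumption:

import faiss
import numpy as np

vecs = np.random.rand(500, 100).astype('float32')
faiss.normalize_L2(vecs)             # in-place L2 normalization

index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)                      # inner product on unit vectors == cosine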
Example #5
Source File: faissVectorIndex.py    From Seq2Seq-Vis with Apache License 2.0
def __init__(self, file_name, dim_vector=500, sentence_max_len=50):
        self.u = faiss.read_index(file_name)  # type: faiss.Index
        self.sentence_max_length = sentence_max_len 
Example #6
Source File: faiss_indexer.py    From BLINK with MIT License
def deserialize_from(self, index_file: str):
        logger.info("Loading index from %s", index_file)
        self.index = faiss.read_index(index_file)
        logger.info(
            "Loaded index of type %s and size %d", type(self.index), self.index.ntotal
        )


# DenseFlatIndexer does exact search 
Example #7
Source File: index_wrapper.py    From exbert with Apache License 2.0
def __init_indexes(self):
        for fname in self.base_dir.glob(self.pattern):
            print(fname)
            idx = fname.stem.split('_')[-1]
            self.indexes[int(idx)] = faiss.read_index(str(fname)) 
Example #8
Source File: run_index.py    From denspi with Apache License 2.0
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_['doc'])
                    group.create_dataset('para', data=in_['para'])
                    group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # IO_FLAG_MMAP memory-maps the file instead of loading it, so
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path,
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size,
        target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path) 
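After the merge, the written index references the .ivfdata file at target_inv_path, so it should be loaded with the IO_FLAG_ONDISK_SAME_DIR flag from Example #2 and with the .ivfdata file kept next to it. A usage sketch with placeholder paths:

import faiss

# the .ivfdata file must sit next to target.faiss for the flag to resolve it
merged = faiss.read_index('merged/target.faiss', faiss.IO_FLAG_ONDISK_SAME_DIR)
merged.nprobe = 64   # IVF search width; tune for the recall/latency trade-off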
Example #9
Source File: embedding_based_indexer.py    From forte with Apache License 2.0
def load(self, path: str, device: Optional[str] = None) -> None:
        r"""Load the index and meta data from ``path`` directory.

        Args:
            path (str): A path to the directory to load the index from.
            device (optional str): Device to load the index into. If None,
                value will be picked from hyperparameters.

        """

        if not os.path.exists(path):
            raise ValueError(f"Failed to load the index. {path} "
                             f"does not exist.")

        cpu_index = faiss.read_index(f"{path}/index.faiss")

        if device is None:
            device = self._config.device

        if device.lower().startswith("gpu"):
            gpu_resource = faiss.StandardGpuResources()
            gpu_id = int(device[3:])
            # GPU ids are 0-based, so gpu_id must be strictly smaller
            # than the number of available GPUs
            if gpu_id >= faiss.get_num_gpus():
                gpu_id = 0
                logging.warning("Cannot create the index on device %s. "
                                "Total number of GPUs on this machine is "
                                "%s. Using gpu 0 for the index.",
                                device, faiss.get_num_gpus())
            self._index = faiss.index_cpu_to_gpu(
                gpu_resource, gpu_id, cpu_index)

        else:
            self._index = cpu_index

        with open(f"{path}/index.meta_data", "rb") as f:
            self._meta_data = pickle.load(f) 
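When the goal is simply "use every available GPU" rather than one specific device, faiss also provides index_cpu_to_all_gpus. A minimal sketch, assuming a GPU-enabled faiss build and a placeholder path:

import faiss

cpu_index = faiss.read_index('index.faiss')           # placeholder path
if faiss.get_num_gpus() > 0:
    index = faiss.index_cpu_to_all_gpus(cpu_index)    # shards across all GPUs
else:
    index = cpu_index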
Example #10
Source File: label.py    From Mosaicer with MIT License
def calculate(self, images):
        predicted = []
        index = faiss.read_index(self.index_path)
        with open(self.id_path) as f:
            id_json = json.load(f)
        logging.info('database load')
        imgs = self.feature.get_feature(images)
        D, I = index.search(imgs, k=1)  # D: distances, I: nearest-neighbor ids
        for p in I:
            predicted.append(id_json[str(p[0])])
        return predicted 
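index.search returns two parallel arrays: distances D and neighbor ids I, each of shape (num_queries, k). A self-contained sketch with arbitrary data:

import faiss
import numpy as np

index = faiss.IndexFlatL2(8)
index.add(np.random.rand(100, 8).astype('float32'))

queries = np.random.rand(3, 8).astype('float32')
D, I = index.search(queries, 5)   # both arrays have shape (3, 5)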
Example #11
Source File: server.py    From ig65m-pytorch with MIT License
def main(args):
    index = read_index(str(args.index.with_suffix(".idx")))
    index.nprobe = args.num_probes

    with args.index.with_suffix(".json").open() as fp:
        metadata = json.load(fp)

    def query(batch, n):
        feats = np.frombuffer(batch.data, dtype=np.float32)
        feats = rearrange(feats, "(n d) -> n d", d=args.dimension)
        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        dists, indices = index.search(feats, n)

        meta = [[metadata[i] for i in row] for row in indices]  # avoid shadowing `batch`

        return dists.tolist(), indices.tolist(), meta

    with SimpleXMLRPCServer((args.host, args.port), logRequests=False) as server:
        server.register_function(query)

        try:
            print("⏳ Waiting for similarity calls on {}:{}".format(args.host, args.port), file=sys.stderr)
            server.serve_forever()
        except KeyboardInterrupt:
            print("\n⌛ Done", file=sys.stderr) 
Example #12
Source File: remove_doc_id.py    From denspi with Apache License 2.0
def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)

        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)

        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths), target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no document ids to remove in %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        # read the index once so that removals from every group accumulate
        # (re-reading inside the loop would discard earlier removals)
        index = faiss.read_index(index_path)
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index.remove_ids(idxs)
                else:
                    print('no document ids to remove at offset %d' % offset)
        faiss.write_index(index, target_path)
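Note that Index.remove_ids expects 64-bit integer ids and is not supported by every index type (GPU indexes in particular). A minimal CPU sketch using an id-mapped flat index:

import faiss
import numpy as np

index = faiss.IndexIDMap(faiss.IndexFlatL2(16))
vecs = np.random.rand(10, 16).astype('float32')
index.add_with_ids(vecs, np.arange(10, dtype='int64'))

removed = index.remove_ids(np.array([3, 7], dtype='int64'))
assert removed == 2 and index.ntotal == 8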
Example #13
Source File: run_index.py    From denspi with Apache License 2.0
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio, vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf, num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio, vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros, norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path, fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para, num_dummy_zeros=args.num_dummy_zeros, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add, offset=args.offset, norm_th=args.norm_th,
                         fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(
            index.nlist, index.code_size,
            args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path) 
Example #14
Source File: knn.py    From learn-to-cluster with MIT License
def __init__(self,
                 feats,
                 k,
                 index_path='',
                 index_key='',
                 nprobe=128,
                 omp_num_threads=None,
                 rebuild_index=True,
                 verbose=True,
                 **kwargs):
        import faiss
        if omp_num_threads is not None:
            faiss.omp_set_num_threads(omp_num_threads)
        self.verbose = verbose
        with Timer('[faiss] build index', verbose):
            if index_path != '' and not rebuild_index and os.path.exists(
                    index_path):
                print('[faiss] read index from {}'.format(index_path))
                index = faiss.read_index(index_path)
            else:
                feats = feats.astype('float32')
                size, dim = feats.shape
                index = faiss.IndexFlatIP(dim)
                if index_key != '':
                    assert index_key.find(
                        'HNSW') < 0, 'HNSW returns distances instead of sims'
                    metric = faiss.METRIC_INNER_PRODUCT
                    nlist = min(4096, 8 * round(math.sqrt(size)))
                    if index_key == 'IVF':
                        quantizer = index
                        index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                                                   metric)
                    else:
                        index = faiss.index_factory(dim, index_key, metric)
                    if index_key.find('Flat') < 0:
                        assert not index.is_trained
                    index.train(feats)
                    index.nprobe = min(nprobe, nlist)
                    assert index.is_trained
                    print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
                index.add(feats)
                if index_path != '':
                    print('[faiss] save index to {}'.format(index_path))
                    mkdir_if_no_exists(index_path)
                    faiss.write_index(index, index_path)
        with Timer('[faiss] query topk {}'.format(k), verbose):
            knn_ofn = index_path + '.npz'
            if os.path.exists(knn_ofn):
                print('[faiss] read knns from {}'.format(knn_ofn))
                self.knns = np.load(knn_ofn)['data']
            else:
                sims, nbrs = index.search(feats, k=k)
                self.knns = [(np.array(nbr, dtype=np.int32),
                              1 - np.array(sim, dtype=np.float32))
                             for nbr, sim in zip(nbrs, sims)]
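The final 1 - sim conversion assumes the features are L2-normalized, so an inner-product similarity in [-1, 1] maps to a cosine distance in [0, 2]. In sketch form:

import numpy as np

sims = np.array([0.95, 0.20], dtype=np.float32)
dists = 1.0 - sims   # cosine distance on unit-norm vectors; 0 means identical direction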