Python faiss.index_factory() Examples

The following are 10 code examples of faiss.index_factory(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module faiss , or try the search function .
Example #1
Source File: train.py    From Mosaicer with MIT License 8 votes vote down vote up
def faiss_train(fn_feature, root_path, index_path='train.index', id_path='data.json'):
    folder_names = os.listdir(root_path)
    logging.info('directory %s ', folder_names)
    ids = None
    vals = None
    id_json = {}
    print(folder_names)
    for idx, folder_name in enumerate(folder_names):
        id_json[str(idx)] = folder_name
        now_path = os.path.join(root_path, folder_name)
        feature_val = fn_feature(now_path)
        vals = np.concatenate((feature_val, vals), axis=0) if vals is not None else feature_val
        id_np = np.asarray([idx] * feature_val.shape[0])
        ids = np.concatenate((id_np, ids), axis=0) if ids is not None else id_np
    N, dim = vals.shape
    x = int(2 * math.sqrt(N))
    index_description = "IVF{x},Flat".format(x=x)
    index = faiss.index_factory(7 * 7 * 512, index_description, faiss.METRIC_INNER_PRODUCT)
    index.train(vals)
    index.add_with_ids(vals, ids)
    faiss.write_index(index, index_path)
    with open(id_path, 'w', encoding='utf-8') as f:
        json.dump(id_json, f, ensure_ascii=False, indent=4)
    print(id_json)
    return index, id_json 
Example #2
Source File: _faiss.py    From mars with Apache License 2.0 6 votes vote down vote up
def execute(cls, ctx, op):
        (data,), device_id, _ = as_same_device(
            [ctx[op.input.key]], device=op.device, ret_extra=True)

        with device(device_id):
            index = faiss.index_factory(data.shape[1], op.faiss_index,
                                        op.faiss_metric_type)

            if device_id >= 0:  # pragma: no cover
                # GPU
                index = _index_to_gpu(index, device_id)
                index.train_c(data.shape[0], _swig_ptr_from_cupy_float32_array(data))
            else:
                index.train(data)

            ctx[op.outputs[0].key] = _store_index(
                ctx, op, index, device_id) 
Example #3
Source File: _faiss.py    From mars with Apache License 2.0 5 votes vote down vote up
def _execute_one_chunk(cls, ctx, op):
        (inp,), device_id, xp = as_same_device(
            [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True)

        with device(device_id):
            # create index
            index = faiss.index_factory(inp.shape[1], op.faiss_index,
                                        op.faiss_metric_type)
            # GPU
            if device_id >= 0:  # pragma: no cover
                index = _index_to_gpu(index, device_id)

            # train index
            if not index.is_trained:
                assert op.n_sample is not None
                sample_indices = xp.random.choice(inp.shape[0],
                                                  size=op.n_sample, replace=False)
                sampled = inp[sample_indices]
                index.train(sampled)

            if op.metric == 'cosine':
                # faiss does not support cosine distances directly,
                # data needs to be normalize before adding to index,
                # refer to:
                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
                faiss.normalize_L2(inp)
            # add vectors to index
            if device_id >= 0:  # pragma: no cover
                # gpu
                inp = inp.astype(np.float32, copy=False)
                index.add_c(inp.shape[0], _swig_ptr_from_cupy_float32_array(inp))
            else:
                index.add(inp)

            ctx[op.outputs[0].key] = _store_index(ctx, op, index, device_id) 
Example #4
Source File: _faiss.py    From mars with Apache License 2.0 5 votes vote down vote up
def build_faiss_index(X, index_name='auto', n_sample=None, metric="euclidean",
                      random_state=None, same_distribution=True,
                      accuracy=False, memory_require=None, **kw):
    X = astensor(X)

    if metric not in METRIC_TO_FAISS_METRIC_TYPE:
        raise ValueError('unknown metric: {}'.format(metric))
    if index_name != 'auto':
        try:
            faiss.index_factory(X.shape[1], index_name,
                                METRIC_TO_FAISS_METRIC_TYPE[metric])
        except RuntimeError:
            raise ValueError('illegal faiss index: {}'.format(index_name))

    rs = check_random_state(random_state)
    if isinstance(rs, RandomState):
        rs = rs.to_numpy()
    seed = gen_random_seeds(1, rs)[0]
    if memory_require is None:
        memory_require = MemoryRequirementGrade.low
    else:
        memory_require = _get_memory_require(memory_require)
    op = FaissBuildIndex(faiss_index=index_name, metric=metric,
                         n_sample=n_sample, gpu=X.op.gpu, seed=seed,
                         same_distribution=same_distribution,
                         accuracy=accuracy, memory_require=memory_require, **kw)
    return op(X) 
Example #5
Source File: faiss_gpu.py    From ann-benchmarks with MIT License 5 votes vote down vote up
def fit(self, X):
        X = X.astype(numpy.float32)
        self._index = faiss.GpuIndexIVFFlat(self._res, len(X[0]), self._n_bits,
                                            faiss.METRIC_L2)
        # self._index = faiss.index_factory(len(X[0]),
        #                                   "IVF%d,Flat" % self._n_bits)
        # co = faiss.GpuClonerOptions()
        # co.useFloat16 = True
        # self._index = faiss.index_cpu_to_gpu(self._res, 0,
        #                                      self._index, co)
        self._index.train(X)
        self._index.add(X)
        self._index.setNumProbes(self._n_probes) 
Example #6
Source File: cdp.py    From capture_reid with Apache License 2.0 5 votes vote down vote up
def cluster(features, th_knn, max_size=300, labels=None):
    '''
    与face-train不同,这里聚类的相似度没有经过1-转换
    :param features:
    :param th_knn:
    :param max_size:
    :return:
    '''
    k = 80
    nprobe = 8

    # knn
    size, dim = features.shape
    metric = faiss.METRIC_INNER_PRODUCT
    nlist = min(4096, 8 * round(math.sqrt(size)))
    if size < 4 * 10000:
        fac_str = "Flat"  # same
    elif size < 80 * 10000:
        fac_str = "IVF" + str(nlist) + ",Flat"  # same
    elif size < 200 * 10000:
        fac_str = "IVF16384,Flat"  # same
    else:
        fac_str = "IVF16384,PQ8"  # same
    logger.info("cdp cluster fac str %s", fac_str)
    index = faiss.index_factory(dim, fac_str, metric)
    index.train(features)
    index.nprobe = min(nprobe, nlist)
    assert index.is_trained
    logger.info('cdp cluster nlist: {}, nprobe: {}'.format(nlist, nprobe))
    index.add(features)

    sims, ners = index.search(features, k=k)
    if "Flat" not in fac_str:
        sims = sim_by_feature(features, features, ners)
    knns = np.concatenate([sims[:, np.newaxis].astype(np.float32), ners[:, np.newaxis].astype(np.float32)], axis=1)
    # del features

    return cluster_by_knns(knns, features, th_knn, max_size, labels) 
Example #7
Source File: _faiss.py    From mars with Apache License 2.0 4 votes vote down vote up
def _execute_map(cls, ctx, op):
        (data,), device_id, _ = as_same_device(
            [ctx[op.inputs[0].key]], device=op.device, ret_extra=True)
        index = ctx[op.inputs[1].key] if len(op.inputs) == 2 else None

        with device(device_id):
            if index is not None:
                # fetch the trained index
                trained_index = _load_index(ctx, op, index, device_id)
                return_index_type = _get_index_type(op.return_index_type, ctx)
                if return_index_type == 'object':
                    # clone a new one,
                    # because faiss does not ensure thread-safe for operations that change index
                    # https://github.com/facebookresearch/faiss/wiki/Threads-and-asynchronous-calls#thread-safety
                    trained_index = faiss.clone_index(trained_index)
            else:
                trained_index = faiss.index_factory(data.shape[1], op.faiss_index,
                                                    op.faiss_metric_type)
                if op.same_distribution:
                    # no need to train, just create index
                    pass
                else:
                    # distribution no the same, train on each chunk
                    trained_index.train(data)

                if device_id >= 0:  # pragma: no cover
                    trained_index = _index_to_gpu(trained_index, device_id)
            if op.metric == 'cosine':
                # faiss does not support cosine distances directly,
                # data needs to be normalize before adding to index,
                # refer to:
                # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
                faiss.normalize_L2(data)

            # add data into index
            if device_id >= 0:  # pragma: no cover
                # gpu
                trained_index.add_c(data.shape[0], _swig_ptr_from_cupy_float32_array(data))
            else:
                trained_index.add(data)

            ctx[op.outputs[0].key] = _store_index(ctx, op, trained_index, device_id) 
Example #8
Source File: test_faiss.py    From mars with Apache License 2.0 4 votes vote down vote up
def testGenIndexStringAndSampleCount(self):
        d = 32

        # accuracy=True, could be Flat only
        ret = _gen_index_string_and_sample_count((10 ** 9, d), None, True, 'minimum')
        self.assertEqual(ret, ('Flat', None))

        # no memory concern
        ret = _gen_index_string_and_sample_count((10 ** 5, d), None, False, 'maximum')
        self.assertEqual(ret, ('HNSW32', None))
        index = faiss.index_factory(d, ret[0])
        self.assertTrue(index.is_trained)

        # memory concern not much
        ret = _gen_index_string_and_sample_count((10 ** 5, d), None, False, 'high')
        self.assertEqual(ret, ('IVF1580,Flat', 47400))
        index = faiss.index_factory(d, ret[0])
        self.assertFalse(index.is_trained)

        # memory quite important
        ret = _gen_index_string_and_sample_count((5 * 10 ** 6, d), None, False, 'low')
        self.assertEqual(ret, ('PCAR16,IVF65536_HNSW32,SQ8', 32 * 65536))
        index = faiss.index_factory(d, ret[0])
        self.assertFalse(index.is_trained)

        # memory very important
        ret = _gen_index_string_and_sample_count((10 ** 8, d), None, False, 'minimum')
        self.assertEqual(ret, ('OPQ16_32,IVF1048576_HNSW32,PQ16', 64 * 65536))
        index = faiss.index_factory(d, ret[0])
        self.assertFalse(index.is_trained)

        ret = _gen_index_string_and_sample_count((10 ** 10, d), None, False, 'low')
        self.assertEqual(ret, ('PCAR16,IVF1048576_HNSW32,SQ8', 64 * 65536))
        index = faiss.index_factory(d, ret[0])
        self.assertFalse(index.is_trained)

        with self.assertRaises(ValueError):
            # M > 64 raise error
            _gen_index_string_and_sample_count((10 ** 5, d), None, False, 'maximum', M=128)

        with self.assertRaises(ValueError):
            # M > 64
            _gen_index_string_and_sample_count((10 ** 5, d), None, False, 'minimum', M=128)

        with self.assertRaises(ValueError):
            # dim should be multiple of M
            _gen_index_string_and_sample_count((10 ** 5, d), None, False, 'minimum', M=16, dim=17)

        with self.assertRaises(ValueError):
            _gen_index_string_and_sample_count((10 ** 5, d), None, False, 'low', k=5) 
Example #9
Source File: knn.py    From learn-to-cluster with MIT License 4 votes vote down vote up
def __init__(self,
                 feats,
                 k,
                 index_path='',
                 index_key='',
                 nprobe=128,
                 omp_num_threads=None,
                 rebuild_index=True,
                 verbose=True,
                 **kwargs):
        import faiss
        if omp_num_threads is not None:
            faiss.omp_set_num_threads(omp_num_threads)
        self.verbose = verbose
        with Timer('[faiss] build index', verbose):
            if index_path != '' and not rebuild_index and os.path.exists(
                    index_path):
                print('[faiss] read index from {}'.format(index_path))
                index = faiss.read_index(index_path)
            else:
                feats = feats.astype('float32')
                size, dim = feats.shape
                index = faiss.IndexFlatIP(dim)
                if index_key != '':
                    assert index_key.find(
                        'HNSW') < 0, 'HNSW returns distances insted of sims'
                    metric = faiss.METRIC_INNER_PRODUCT
                    nlist = min(4096, 8 * round(math.sqrt(size)))
                    if index_key == 'IVF':
                        quantizer = index
                        index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                                                   metric)
                    else:
                        index = faiss.index_factory(dim, index_key, metric)
                    if index_key.find('Flat') < 0:
                        assert not index.is_trained
                    index.train(feats)
                    index.nprobe = min(nprobe, nlist)
                    assert index.is_trained
                    print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
                index.add(feats)
                if index_path != '':
                    print('[faiss] save index to {}'.format(index_path))
                    mkdir_if_no_exists(index_path)
                    faiss.write_index(index, index_path)
        with Timer('[faiss] query topk {}'.format(k), verbose):
            knn_ofn = index_path + '.npz'
            if os.path.exists(knn_ofn):
                print('[faiss] read knns from {}'.format(knn_ofn))
                self.knns = np.load(knn_ofn)['data']
            else:
                sims, nbrs = index.search(feats, k=k)
                self.knns = [(np.array(nbr, dtype=np.int32),
                              1 - np.array(sim, dtype=np.float32))
                             for nbr, sim in zip(nbrs, sims)] 
Example #10
Source File: faiss_gpu.py    From learn-to-cluster with MIT License 4 votes vote down vote up
def __init__(self,
                 target,
                 nprobe=128,
                 index_factory_str=None,
                 verbose=False,
                 mode='proxy',
                 using_gpu=True):
        self._res_list = []

        num_gpu = faiss.get_num_gpus()
        print('[faiss gpu] #GPU: {}'.format(num_gpu))

        size, dim = target.shape
        assert size > 0, "size: {}".format(size)
        index_factory_str = "IVF{},PQ{}".format(
            min(8192, 16 * round(np.sqrt(size))),
            32) if index_factory_str is None else index_factory_str
        cpu_index = faiss.index_factory(dim, index_factory_str)
        cpu_index.nprobe = nprobe

        if mode == 'proxy':
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False

            index = faiss.IndexProxy()
            for i in range(num_gpu):
                res = faiss.StandardGpuResources()
                self._res_list.append(res)
                sub_index = faiss.index_cpu_to_gpu(
                    res, i, cpu_index, co) if using_gpu else cpu_index
                index.addIndex(sub_index)
        elif mode == 'shard':
            co = faiss.GpuMultipleClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False
            co.shard = True
            index = faiss.index_cpu_to_all_gpus(cpu_index,
                                                co,
                                                ngpu=num_gpu)
        else:
            raise KeyError("Unknown index mode")

        index = faiss.IndexIDMap(index)
        index.verbose = verbose

        # get nlist to decide how many samples used for training
        nlist = int([
            item for item in index_factory_str.split(",") if 'IVF' in item
        ][0].replace("IVF", ""))

        # training
        if not index.is_trained:
            indexes_sample_for_train = np.random.randint(
                0, size, nlist * 256)
            index.train(target[indexes_sample_for_train])

        # add with ids
        target_ids = np.arange(0, size)
        index.add_with_ids(target, target_ids)
        self.index = index