Python faiss.write_index() Examples

The following are 13 code examples of faiss.write_index(), drawn from open-source projects. The source file, project, and license for each example are noted above it.
Example #1
Source File: train.py    From Mosaicer with MIT License
def faiss_train(fn_feature, root_path, index_path='train.index', id_path='data.json'):
    folder_names = os.listdir(root_path)
    logging.info('directory %s ', folder_names)
    ids = None
    vals = None
    id_json = {}
    print(folder_names)
    for idx, folder_name in enumerate(folder_names):
        id_json[str(idx)] = folder_name
        now_path = os.path.join(root_path, folder_name)
        feature_val = fn_feature(now_path)
        vals = np.concatenate((feature_val, vals), axis=0) if vals is not None else feature_val
        id_np = np.full(feature_val.shape[0], idx, dtype=np.int64)  # faiss ids must be int64
        ids = np.concatenate((id_np, ids), axis=0) if ids is not None else id_np
    N, dim = vals.shape
    x = int(2 * math.sqrt(N))  # rule of thumb: number of IVF clusters ~ sqrt(N)
    index_description = "IVF{x},Flat".format(x=x)
    # use the measured dimensionality (7 * 7 * 512 for this project's features)
    index = faiss.index_factory(dim, index_description, faiss.METRIC_INNER_PRODUCT)
    index.train(vals)
    index.add_with_ids(vals, ids)
    faiss.write_index(index, index_path)
    with open(id_path, 'w', encoding='utf-8') as f:
        json.dump(id_json, f, ensure_ascii=False, indent=4)
    print(id_json)
    return index, id_json 
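Reading the artifacts back is symmetric: faiss.read_index() restores the IVF index and the JSON file restores the id-to-folder mapping. A minimal sketch, assuming the default 'train.index' and 'data.json' paths from the example and a query vector of matching dimensionality:

import json

import faiss
import numpy as np

index = faiss.read_index('train.index')
index.nprobe = 16  # IVF search visits only nprobe clusters; raise for better recall
with open('data.json', 'r', encoding='utf-8') as f:
    id_json = json.load(f)

query = np.random.rand(1, index.d).astype('float32')  # stand-in for a real feature
sims, ids = index.search(query, 5)
print([id_json[str(i)] for i in ids[0] if i != -1])  # -1 pads missing results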
Example #2
Source File: h5_to_faiss.py    From Seq2Seq-Vis with Apache License 2.0
def main():
    f = h5py.File(opt.states, "r")
    data = f[opt.data]
    seqs, slens, hid = data.shape

    print("Processing {} Sequences".format(seqs))
    print("with {} tokens each".format(slens))
    print("and {} states".format(hid))

    # Initialize a new index
    index = faiss.IndexFlatIP(hid)
    # Fill it
    # slicing past the end clamps, so the final partial chunk is still added
    for ix in tqdm(range(0, seqs, opt.stepsize)):
        cdata = np.array(data[ix:ix + opt.stepsize].reshape(-1, hid),
                         dtype="float32")
        index.add(cdata)
    f.close()

    faiss.write_index(index, opt.output) 
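Since IndexFlatIP scores by raw inner product, cosine similarity requires L2-normalized vectors on both sides. A sketch of reading the written index back and querying it ('states.faiss' is a hypothetical stand-in for whatever opt.output pointed to):

import faiss
import numpy as np

index = faiss.read_index("states.faiss")  # hypothetical stand-in for opt.output
q = np.random.rand(1, index.d).astype("float32")
faiss.normalize_L2(q)  # in-place; only needed if the stored states were normalized too
scores, ids = index.search(q, 10)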
Example #3
Source File: run_index.py    From denspi with Apache License 2.0
def train_coarse_quantizer(data, quantizer_path, num_clusters, hnsw=False, niter=10, cuda=False):
    d = data.shape[1]

    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)

    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)

    faiss.write_index(quantizer, quantizer_path) 
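The Clustering object gives fine-grained control, but faiss also ships a convenience wrapper that produces the same centroids in fewer lines. A sketch of the equivalent using faiss.Kmeans, under the same assumptions (float32 data, num_clusters centroids):

import faiss

def train_coarse_quantizer_kmeans(data, num_clusters, niter=10, cuda=False):
    d = data.shape[1]
    kmeans = faiss.Kmeans(d, num_clusters, niter=niter, verbose=True, gpu=cuda)
    kmeans.train(data)
    return kmeans.centroids  # (num_clusters, d) float32 array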
Example #4
Source File: run_index.py    From denspi with Apache License 2.0
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        # the quantizer type must be given explicitly; without it, METRIC_L2
        # would be consumed as the qtype argument
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; training on CPU instead.')
            trained_index.train(data)
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)
    faiss.write_index(trained_index, trained_index_path) 
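The index written here is trained but empty (ntotal == 0); vectors still have to be added before it can answer queries. A minimal sketch, with 'trained.index' standing in for trained_index_path:

import faiss
import numpy as np

index = faiss.read_index('trained.index')  # hypothetical stand-in for trained_index_path
xb = np.random.rand(1000, index.d).astype('float32')  # stand-in for real vectors
index.add(xb)
index.nprobe = 32
D, I = index.search(xb[:5], 10)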
Example #5
Source File: embedding_based_indexer.py    From forte with Apache License 2.0
def save(self, path: str) -> None:
        r"""Save the index and meta data in ``path`` directory. The index
        will be saved as ``index.faiss`` and ``index.meta_data`` respectively
        inside ``path`` directory.

        Args:
            path (str): A path to the directory where the index will be saved

        """

        if os.path.exists(path):
            logging.warning("%s directory already exists. Index will be "
                            "saved into an existing directory", path)
        else:
            os.makedirs(path)

        cpu_index = faiss.index_gpu_to_cpu(self._index) \
            if self._index.__class__.__name__.startswith("Gpu") else self._index
        faiss.write_index(cpu_index, f"{path}/index.faiss")
        with open(f"{path}/index.meta_data", "wb") as f:
            pickle.dump(self._meta_data, f) 
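A load() counterpart would reverse both steps; a sketch under the same file layout (moving the index back to GPU, if desired, is left to the caller):

    def load(self, path: str) -> None:
        r"""Load the index and meta data from the ``path`` directory (sketch)."""
        self._index = faiss.read_index(f"{path}/index.faiss")
        with open(f"{path}/index.meta_data", "rb") as f:
            self._meta_data = pickle.load(f)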
Example #6
Source File: _faiss.py    From mars with Apache License 2.0
def _store_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)

    if return_index_type == 'object':
        # no need to serialize
        return index
    elif return_index_type == 'filename':
        # save to file, then return filename
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)
        fd, fn = tempfile.mkstemp('.index', prefix='faiss_')
        os.close(fd)  # mkstemp returns an open fd; faiss reopens the path itself
        faiss.write_index(index, fn)

        atexit.register(lambda: os.remove(fn))

        return fn
    else:
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)
        # distributed, save to file, then return in memory bytes
        fd, fn = tempfile.mkstemp('.index', prefix='faiss_')
        os.close(fd)  # mkstemp returns an open fd; faiss reopens the path itself
        faiss.write_index(index, fn)
        try:
            with open(fn, 'rb') as f:
                return f.read()
        finally:
            os.remove(fn) 
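For the bytes branch, faiss can also serialize straight to a buffer, which avoids the temporary file entirely. A sketch using faiss.serialize_index() and faiss.deserialize_index():

import faiss
import numpy as np

def index_to_bytes(index):
    return bytes(faiss.serialize_index(index))  # serialize_index returns a uint8 array

def index_from_bytes(data):
    return faiss.deserialize_index(np.frombuffer(data, dtype='uint8'))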
Example #7
Source File: indexing.py    From faiss-server with MIT License
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        # syn0norm holds the L2-normalized vectors (gensim < 4.0) and is only
        # populated after init_sims() has been called on the model
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH) 
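Note that wv.syn0norm was removed in gensim 4.0. A sketch of the same indexing step on a modern gensim model, where get_normed_vectors() returns the L2-normalized vectors that inner-product search wants for cosine similarity:

import faiss

def do_indexing_gensim4(model, index_path):
    index = faiss.IndexFlatIP(model.vector_size)
    index.add(model.wv.get_normed_vectors())  # gensim >= 4.0 replacement for syn0norm
    faiss.write_index(index, index_path)
    return index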
Example #8
Source File: faiss_indexer.py    From BLINK with MIT License
def serialize(self, index_file: str):
        logger.info("Serializing index to %s", index_file)
        faiss.write_index(self.index, index_file) 
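The matching read side is a one-liner around faiss.read_index(); a sketch in the same style (the method name deserialize is a hypothetical counterpart, not shown in the original class):

    def deserialize(self, index_file: str):
        logger.info("Reading index from %s", index_file)
        self.index = faiss.read_index(index_file)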
Example #9
Source File: create_faiss.py    From exbert with Apache License 2.0
def save_indexes(idxs, outdir, base_name=LAYER_TEMPLATE):
    """Save the faiss index into a file for each index in idxs"""

    base_dir = Path(outdir)
    base_dir.mkdir(exist_ok=True, parents=True)  # exist_ok=True makes this safe to call unconditionally

    out_name = str(base_dir / base_name)
    for i, idx in enumerate(idxs):
        name = out_name.format(i)
        print(f"Saving to {name}")
        faiss.write_index(idx, name) 
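Loading mirrors saving: format the same per-layer template and read each file back. A sketch, assuming n_layers files were written by save_indexes (LAYER_TEMPLATE comes from the original module):

import faiss
from pathlib import Path

def load_indexes(outdir, n_layers, base_name=LAYER_TEMPLATE):
    out_name = str(Path(outdir) / base_name)
    return [faiss.read_index(out_name.format(i)) for i in range(n_layers)]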
Example #10
Source File: run_index.py    From denspi with Apache License 2.0
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_['doc'])
                    group.create_dataset('para', data=in_['para'])
                    group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # IO_FLAG_MMAP avoids actually loading the data, so the total
        # size of the inverted lists can exceed the available RAM
        index = faiss.read_index(index_path,
                                 faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(
        index.nlist, index.code_size,
        target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path) 
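One caveat: the path passed to OnDiskInvertedLists is recorded inside the written index, so the .ivfdata file must remain reachable at that same path when the merged index is opened later. A sketch of reading the merged result back and querying it ('merged.index' stands in for target_index_path):

import faiss
import numpy as np

index = faiss.read_index('merged.index')  # hypothetical stand-in for target_index_path
index.nprobe = 64  # the merged index is IVF-based, so nprobe applies
queries = np.random.rand(4, index.d).astype('float32')
D, I = index.search(queries, 10)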
Example #11
Source File: remove_doc_id.py    From denspi with Apache License 2.0
def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)

        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)

        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths), target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no ids to remove in %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index = faiss.read_index(index_path)
                    index.remove_ids(idxs)
                    faiss.write_index(index, target_path)
                else:
                    print('no ids to remove at offset %d' % offset)
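remove_ids() accepts a numpy int64 array (the Python wrapper turns it into an IDSelectorBatch) and returns the number of vectors actually removed, which makes a cheap sanity check. A minimal sketch with hypothetical paths:

import faiss
import numpy as np

index = faiss.read_index('sub.index')       # hypothetical subindex path
ids_to_drop = np.array([3, 17, 42], dtype='int64')
n_removed = index.remove_ids(ids_to_drop)   # number of vectors actually dropped
assert n_removed <= ids_to_drop.size
faiss.write_index(index, 'sub.cleaned.index')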
Example #12
Source File: run_index.py    From denspi with Apache License 2.0
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio, vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf, num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio, vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros, norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path, fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para, num_dummy_zeros=args.num_dummy_zeros, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add, offset=args.offset, norm_th=args.norm_th,
                         fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(
            index.nlist, index.code_size,
            args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path) 
Example #13
Source File: knn.py    From learn-to-cluster with MIT License
def __init__(self,
                 feats,
                 k,
                 index_path='',
                 index_key='',
                 nprobe=128,
                 omp_num_threads=None,
                 rebuild_index=True,
                 verbose=True,
                 **kwargs):
        import faiss
        if omp_num_threads is not None:
            faiss.omp_set_num_threads(omp_num_threads)
        self.verbose = verbose
        with Timer('[faiss] build index', verbose):
            if index_path != '' and not rebuild_index and os.path.exists(
                    index_path):
                print('[faiss] read index from {}'.format(index_path))
                index = faiss.read_index(index_path)
            else:
                feats = feats.astype('float32')
                size, dim = feats.shape
                index = faiss.IndexFlatIP(dim)
                if index_key != '':
                    assert index_key.find(
                        'HNSW') < 0, 'HNSW returns distances instead of sims'
                    metric = faiss.METRIC_INNER_PRODUCT
                    nlist = min(4096, 8 * round(math.sqrt(size)))
                    if index_key == 'IVF':
                        quantizer = index
                        index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                                                   metric)
                    else:
                        index = faiss.index_factory(dim, index_key, metric)
                    if index_key.find('Flat') < 0:
                        assert not index.is_trained
                    index.train(feats)
                    index.nprobe = min(nprobe, nlist)
                    assert index.is_trained
                    print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
                index.add(feats)
                if index_path != '':
                    print('[faiss] save index to {}'.format(index_path))
                    mkdir_if_no_exists(index_path)
                    faiss.write_index(index, index_path)
        with Timer('[faiss] query topk {}'.format(k), verbose):
            knn_ofn = index_path + '.npz'
            if os.path.exists(knn_ofn):
                print('[faiss] read knns from {}'.format(knn_ofn))
                self.knns = np.load(knn_ofn)['data']
            else:
                sims, nbrs = index.search(feats, k=k)
                self.knns = [(np.array(nbr, dtype=np.int32),
                              1 - np.array(sim, dtype=np.float32))
                             for nbr, sim in zip(nbrs, sims)]
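The class name is not visible in this excerpt (in the learn-to-cluster repository this constructor belongs to a knn_faiss wrapper). Assuming that name, usage might look like:

import numpy as np

feats = np.random.rand(1000, 256).astype('float32')  # stand-in features
knn = knn_faiss(feats, k=10, index_key='IVF', nprobe=16)
nbrs, dists = knn.knns[0]  # per-sample (neighbor ids, 1 - inner-product similarity)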