from tensorflow.examples.tutorials.mnist.input_data import read_data_sets
import tensorflow as tf
import numpy as np
from scipy.io import loadmat
import pickle, time
import os
from collections import OrderedDict as odict
from functools import reduce
from sklearn.cluster import KMeans
from bokeh.plotting import figure,output_file,output_notebook,show
import bokeh

class Dataset(object):
    def __init__(self,name,path='./datasets/'):
        self.path=path
        self.name=name.upper()
        if self.name=='MNIST' or self.name=='FMNIST':
            self.indim=784
            try:
                self.data=read_data_sets(self.path+self.name)
            except OSError as err:
                print(str(err))
                raise ValueError('Try again')
        elif self.name=='CIFAR10':
            self.indim=(32,32,3)
            if self.name not in os.listdir(self.path):
                print('Data not in path')
                raise ValueError()
        elif self.name=='GLOVE':
            self.indim=300
            self.data=pickle.load(open(self.path+'glove30k.p','rb'))
        elif self.name=='SIFT':
            self.indim=128
            self.data=loadmat(self.path+self.name+'/siftvecs.mat')['vecs']
        elif self.name=='GIST':
            self.indim=960
            self.data=loadmat(self.path+self.name+'/gistvecs.mat')['vecs']
        elif self.name=='LMGIST':
            self.indim=512
            self.data=loadmat(self.path+self.name+'/LabelMe_gist.mat')['gist']
        elif self.name=='RANDOM':
            self.indim=128
            self.data=np.random.random(size=(100_000,self.indim)) #np.random.randn(100_000,self.indim)

    def train_batches(self,batch_size=64,sub_mean=False,maxsize=-1):
        if self.name in ['MNIST','FMNIST']:
            max_=self.data.train.images.shape[0]-batch_size if maxsize==-1 else maxsize-batch_size
            for idx in range(0,max_,batch_size):
                batch_x=self.data.train.images[idx:idx+batch_size,:]
                batch_y=self.data.train.labels[idx:idx+batch_size]
                batch_y=np.eye(10)[batch_y]
                if sub_mean:
                    batch_x=batch_x-batch_x.mean(axis=1)[:,None]
                yield batch_x,batch_y
        elif self.name=='CIFAR10':
            for batch_num in [1,2,3,4,5]:
                filename=self.path+self.name+'/train_batch_'+str(batch_num)+'.p' #prepend self.path so the pickles are found, as in test_set below
                with open(filename,mode='rb') as f:
                    features,labels=pickle.load(f)
                for begin in range(0,len(features),batch_size):
                    end=min(begin+batch_size,len(features))
                    yield features[begin:end],labels[begin:end]
        elif self.name in ['GLOVE','SIFT','LMGIST','RANDOM']:
            max_=self.data.shape[0]-batch_size if maxsize==-1 else maxsize-batch_size
            for idx in range(0,max_,batch_size):
                batch_x=self.data[idx:idx+batch_size,:]
                if sub_mean:
                    batch_x=batch_x-batch_x.mean(axis=1)[:,None]
                yield batch_x,None

    def test_set(self,maxsize=-1,sub_mean=False):
        #maxsize determines how many elements of test set to return
        if self.name in ['MNIST','FMNIST']:
            test_x=self.data.test.images[:maxsize]
            test_y=np.eye(10)[self.data.test.labels[:maxsize]]
            if sub_mean:
                test_x=test_x-test_x.mean(axis=1)[:,None]
            return (test_x,test_y)
        elif self.name=='CIFAR10':
            with open(self.path+self.name+'/test_batch.p',mode='rb') as f:
                features,labels=pickle.load(f)
            test_x,test_y=features[:maxsize],labels[:maxsize]
            if sub_mean:
                test_x=test_x-test_x.mean(axis=1)[:,None]
            return test_x,test_y
        elif self.name in ['GLOVE','SIFT','LMGIST','RANDOM']:
            test_x=self.data[:maxsize]
            #test_y=np.eye(10)[self.data.test.labels[:maxsize]]
            if sub_mean:
                test_x=test_x-test_x.mean(axis=1)[:,None]
            return (test_x,None)
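# Minimal usage sketch (not part of the original pipeline): how a Dataset object is
# typically consumed by the code below. The function name and the sizes used here
# (64, 1024, 100) are illustrative assumptions, not settings from the experiments.
def _demo_dataset_usage():
    demo_data=Dataset('mnist')                  #loads/downloads MNIST under ./datasets/
    for batch_x,batch_y in demo_data.train_batches(batch_size=64,sub_mean=True,maxsize=1024):
        pass                                    #batch_x: (64,784) floats, batch_y: one-hot labels
    test_x,test_y=demo_data.test_set(maxsize=100,sub_mean=True)
    return test_x.shape,test_y.shape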
class LSH(object):
    def __init__(self,data,hash_length):
        """
        data: Nxd matrix
        hash_length: scalar, number of random hyperplanes (bits) used for each hash
        """
        self.hash_length=hash_length
        self.data=data-np.mean(data,axis=1)[:,None]
        self.weights=np.random.random((data.shape[1],hash_length))
        self.hashes=(self.data@self.weights)>0
        self.maxl1distance=2*self.hash_length

    def query(self,qidx,nnn,not_olap=False):
        L1_distances=np.sum(np.abs(self.hashes[qidx,:]^self.hashes),axis=1) #np.sum(np.bitwise_xor(self.hashes[qidx,:],self.hashes),axis=1)
        nnn=min(self.hashes.shape[0],nnn)
        if not_olap:
            no_overlaps=np.sum(L1_distances==self.maxl1distance)
            return no_overlaps
        NNs=L1_distances.argsort()
        NNs=NNs[(NNs != qidx)][:nnn]
        #print(L1_distances[NNs])
        #an interesting property of this hash is that the L1 distances are always even
        return NNs

    def true_nns(self,qidx,nnn):
        sample=self.data[qidx,:]
        tnns=np.sum((self.data-sample)**2,axis=1).argsort()[:nnn+1]
        tnns=tnns[(tnns!=qidx)]
        if nnn<self.data.shape[0]:
            assert len(tnns)==nnn, 'nnn={}'.format(nnn)
        return tnns

    def construct_true_nns(self,indices,nnn):
        all_NNs=np.zeros((len(indices),nnn))
        for idx1,idx2 in enumerate(indices):
            all_NNs[idx1,:]=self.true_nns(idx2,nnn)
        return all_NNs

    def AP(self,predictions,truth):
        assert len(predictions)==len(truth) or len(predictions)==self.hashes.shape[0]
        #removed conversion to list in next line:
        precisions=[len((set(predictions[:idx]).intersection(set(truth[:idx]))))/idx for idx in range(1,len(truth)+1)]
        return np.mean(precisions)

    def PR(self,qidx,truth,atindices):
        """truth should be a set"""
        L1_distances=np.sum((self.hashes[qidx,:]^self.hashes),axis=1)
        NNs=L1_distances.argsort()
        NNs=NNs[(NNs != qidx)]
        #predictions=NNs
        recalls=np.arange(1,len(truth)+1)
        all_recalls=[len(set(NNs[:idx])&truth) for idx in atindices]
        #all_recalls.append(len(set(NNs)&truth))
        #all_recalls=[len(set(predictions[:idx])&truth) for idx in range(1,self.hashes.shape[0]+1)]
        #indices=[all_recalls.index(recall) for recall in recalls]
        precisions= [recall/(idx+1) for idx,recall in zip(atindices,all_recalls)]
        #this_pr=odict({l:(p,r) for l,p,r in zip(atL1,precisions,recalls)})
        return precisions,all_recalls #(precisions,all_recalls)

    def ROC(self,qidx,truth,atindices):
        """x: False positive rate, y: True positive rate, truth should be a set"""
        L1_distances=np.sum((self.hashes[qidx,:]^self.hashes),axis=1)
        NNs=L1_distances.argsort()
        NNs=NNs[(NNs != qidx)]
        x,y=[],[]
        for idx in atindices:
            ntruepos=len((set(NNs[:idx])&truth)) #number of true neighbours correctly retrieved
            nfalsepos=idx-ntruepos #number of non-neighbours incorrectly retrieved
            tpr= ntruepos/len(truth) #positives correctly classified / total positives
            fpr= nfalsepos/(len(NNs)-len(truth)) #negatives incorrectly classified / total negatives
            x.append(fpr)
            y.append(tpr)
        return x,y

    def findmAP(self,nnn,n_points):
        start=np.random.randint(low=0,high=self.data.shape[0]-n_points)
        sample_indices=np.random.choice(self.data.shape[0],n_points)
        all_NNs=self.construct_true_nns(sample_indices,nnn)
        self.allAPs=[]
        for eidx,didx in enumerate(sample_indices):
            #eidx: enumeration id, didx: index of sample in self.data
            this_nns=self.query(didx,nnn)
            #print(len(this_nns))
            this_AP=self.AP(list(this_nns),list(all_NNs[eidx,:]))
            #print(this_AP)
            self.allAPs.append(this_AP)
        return np.mean(self.allAPs)

    def findZKk(self,n_points):
        """
        ZKk is the number of vectors whose overlap with a specific vector is zero
        """
        sample_indices=np.random.choice(self.data.shape[0],n_points)
        no_overlaps=[]
        for eidx,didx in enumerate(sample_indices):
            no_overlaps.append(self.query(didx,-20,not_olap=True))
        return np.mean(no_overlaps)
    def computePRC(self,n_points=1,nnn=200,atindices=None):
        """
        This function calculates precision-recall metrics for model
        """
        def replacenans(x):
            nanidxs=[idx for idx in range(len(x)) if np.isnan(x[idx])]
            notnang= lambda idx: [nidx for nidx in range(idx+1,len(x)) if nidx not in nanidxs][0]
            notnans= lambda idx: [nidx for nidx in range(idx) if nidx not in nanidxs][-1]
            if len(nanidxs)==0:
                return x
            else:
                for nanidx in nanidxs:
                    if nanidx==0:
                        x[nanidx]=x[notnang(nanidx)]
                    else:
                        x[nanidx]=(x[notnang(nanidx)]+x[notnans(nanidx)])/2
                return x
        sample_indices=np.random.choice(self.data.shape[0],n_points)
        all_NNs=self.construct_true_nns(sample_indices,nnn)
        self.allprecisions=np.zeros((n_points,len(atindices)))
        self.allrecalls=np.zeros((n_points,len(atindices)))
        #allprs=odict({l:[[],[]] for l in atL1})
        for eidx,didx in enumerate(sample_indices):
            """eidx: enumeration id, didx: index of sample in self.data"""
            #this_nns=self.query(didx,self.hashes.shape[0]) #this is intentionally kept a large number
            this_p,this_r=self.PR(didx,set(all_NNs[eidx,:]),atindices)
            #[allprcs[r].append(p) for p,r in zip(this_p,this_r)]
            self.allprecisions[eidx,:]=this_p
            self.allrecalls[eidx,:]=this_r
        return [self.allprecisions.mean(axis=0),self.allrecalls.mean(axis=0)] #replacenans([np.nanmean(v) for _,v in allprcs.items()])

    def computeROC(self,n_points=1,nnn=200,atindices=None):
        """
        This function calculates receiver operator characteristics (ROC)
        """
        sample_indices=np.random.choice(self.hashes.shape[0],n_points)
        all_NNs=self.construct_true_nns(sample_indices,nnn)
        alltprs=np.zeros((n_points,len(atindices)))
        allfprs=np.zeros((n_points,len(atindices)))
        for eidx,didx in enumerate(sample_indices):
            this_fpr,this_tpr=self.ROC(didx,set(all_NNs[eidx,:]),atindices)
            allfprs[eidx,:]=this_fpr
            alltprs[eidx,:]=this_tpr
        return [allfprs.mean(axis=0),alltprs.mean(axis=0)]

    def create_bins(self):
        if hasattr(self,'bins'):
            return
        start=time.time()
        self.bins=np.unique(self.hashes,axis=0)
        self.num_bins=self.bins.shape[0]
        assignment=np.zeros(self.hashes.shape[0])
        for idx,_bin in enumerate(self.bins):
            assignment[(self.hashes==_bin).all(axis=1)]=idx
        self.binstopoints={bin_idx:np.flatnonzero(assignment==bin_idx) for bin_idx in range(self.bins.shape[0])}
        self.pointstobins={point:int(_bin) for point,_bin in enumerate(assignment)}
        self.timetoindex=time.time()-start

    def query_bins(self,qidx,search_radius=1,order=True):
        if not hasattr(self,'bins'):
            raise ValueError('Bins for model not created')
        query_bin=self.bins[self.pointstobins[qidx]]
        valid_bins=np.flatnonzero((query_bin[None,:]^self.bins).sum(axis=1)<=search_radius)
        all_points=reduce(np.union1d,np.array([self.binstopoints[idx] for idx in valid_bins]))
        if order:
            l1distances=(self.hashes[qidx,:]^self.hashes[all_points,:]).sum(axis=1)
            all_points=all_points[l1distances.argsort()]
        return all_points

    def compute_query_mAP(self,n_points,search_radius=1,order=True,nnn=None):
        sample_indices=np.random.choice(self.hashes.shape[0],n_points)
        average_precisions=[]
        elapsed=[]
        numpredicted=[]
        ms = lambda l:(np.mean(l),np.std(l))
        for qidx in sample_indices:
            start=time.time()
            predicted=self.query_bins(qidx,search_radius)
            if nnn is None:
                elapsed.append(time.time()-start)
            else:
                if len(predicted)<nnn:
                    #raise ValueError('Not a good search radius')
                    continue
                elapsed.append(time.time()-start)
            numpredicted.append(len(predicted))
            truenns=self.true_nns(qidx,nnn=len(predicted))
            average_precisions.append(self.AP(predictions=predicted,truth=truenns))
        if nnn is not None:
            if len(average_precisions)<0.8*n_points: #require at least 80% of sampled queries to succeed, as in the flylsh version below
                raise ValueError('Not a good search radius')
        return [*ms(average_precisions),*ms(elapsed),*ms(numpredicted)]
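# Minimal usage sketch for the plain LSH model, showing the two query paths: brute-force
# ranking via findmAP, and bucketed search via create_bins/query_bins. The function name,
# the random data and the sizes below are illustrative assumptions.
def _demo_lsh_usage(inputs_=None,hash_length=16):
    if inputs_ is None:
        inputs_=np.random.random((1000,128))    #any Nxd matrix works
    model=LSH(inputs_,hash_length)
    map_estimate=model.findmAP(nnn=20,n_points=50)       #mAP over 50 random queries
    model.create_bins()                                  #index hashes into Hamming buckets
    neighbours=model.query_bins(qidx=0,search_radius=1)  #candidates within Hamming radius 1
    return map_estimate,neighbours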
class product_quantization(LSH):
    def __init__(self,data,m,kstar):
        self.data=data-np.mean(data,axis=1)[:,None]
        D=self.data.shape[1] #D as defined in paper
        dstar=D//m #this will return an int
        assert data.shape[1]%m==0, 'please make sure that m is a divisor of D'
        self.kmeans=[KMeans(n_clusters=kstar,random_state=0) for _ in range(m)]
        self.fitted=[km.fit_predict(self.data[:,i*dstar:(i+1)*dstar]) for (i,km) in enumerate(self.kmeans)]
        self.approximated=np.concatenate([km.cluster_centers_[f] for f,km in zip(self.fitted,self.kmeans)],axis=1)

    def query(self,qidx,nnn,not_olap=False):
        estimated_L2=np.sum((self.data[qidx,:]-self.approximated)**2,axis=1)
        nnn=min(self.data.shape[0],nnn)
        # if not_olap:
        #     no_overlaps=np.sum(L1_distances==self.maxl1distance)
        #     return no_overlaps
        NNs=estimated_L2.argsort()
        NNs=NNs[(NNs != qidx)][:nnn]
        return NNs

    def PR(self,qidx,truth,atindices):
        """truth should be a set"""
        estimated_L2=np.sum((self.data[qidx,:]-self.approximated)**2,axis=1)
        NNs=estimated_L2.argsort()
        NNs=NNs[(NNs != qidx)]
        recalls=np.arange(1,len(truth)+1)
        all_recalls=[len(set(NNs[:idx])&truth) for idx in atindices]
        precisions= [recall/(idx+1) for idx,recall in zip(atindices,all_recalls)]
        return precisions,all_recalls #(precisions,all_recalls)

class AELSH(LSH):
    def __init__(self,data,weights):
        """
        data: Nxd matrix
        weights: dxm projection matrix (e.g. learnt by an autoencoder);
        the hash length equals the number of columns of weights
        """
        self.hash_length=weights.shape[1]
        self.data=data-np.mean(data,axis=1)[:,None]
        self.weights=weights
        self.hashes=(self.data@self.weights)>0
        self.maxl1distance=2*self.hash_length

class LSHpar_ensemble(object):
    #unfinished stub for a parallelized ensemble; only records the number of models
    def __init__(self,data,hash_length,K):
        self.n_models=K
        def _create_model():
            mymodel=LSH(data,hash_length)
            mymodel.create_bins()

class LSHensemble(object):
    def __init__(self,data,hash_length,K):
        self.models=[LSH(data,hash_length) for _ in range(K)]
        self.numsamples=data.shape[0]
        self.firstmodel=self.models[0]
        self.firstmodel.create_bins()
        for m in self.models[1:]:
            m.create_bins()
            del m.data #remove data
        self.timetoindex=sum([m.timetoindex for m in self.models])

    def compute_recall(self,n_points,nnn,sr):
        sample_indices=np.random.choice(self.numsamples,n_points)
        recalls=[]
        elapsed=[]
        numpredicted=[]
        for qidx in sample_indices:
            start=time.time()
            #preds=np.array([m.query_bins(qidx,sr) for m in self.models])
            predicted=self.firstmodel.query_bins(qidx,sr) #reduce(np.union1d,preds)
            if len(predicted)<nnn:
                raise ValueError('Not a good search radius')
            numpredicted.append(len(predicted))
            l1distances=np.array([np.sum((m.hashes[predicted,:]^m.hashes[qidx,:]),axis=1) for m in self.models])
            rankings=l1distances.mean(axis=0).argsort()
            #trusted_model=self.models[np.argmax([len(p) for p in preds])]
            #rankings=np.sum((trusted_model.hashes[predicted,:]^trusted_model.hashes[qidx,:]),axis=1).argsort()
            predicted=predicted[rankings][:nnn]
            elapsed.append(time.time()-start)
            trueNNs=self.firstmodel.true_nns(qidx,nnn)
            recalls.append(len(set(predicted)&set(trueNNs))/nnn)
        return [np.mean(recalls),np.std(recalls),np.mean(elapsed),np.std(elapsed),np.mean(numpredicted),np.std(numpredicted)]

    def compute_ens_mAP(self,n_points,nnn,sr):
        sample_indices=np.random.choice(self.numsamples,n_points)
        allAPs=[]
        elapsed=[]
        numpredicted=[]
        ms = lambda l:(np.mean(l),np.std(l))
        for qidx in sample_indices:
            start=time.time()
            preds=np.array([m.query_bins(qidx,sr) for m in self.models])
            predicted=reduce(np.union1d,preds)
            if len(predicted)<nnn:
                #raise ValueError('Not a good search radius')
                continue
            numpredicted.append(len(predicted))
            l1distances=np.array([np.sum((m.hashes[predicted,:]^m.hashes[qidx,:]),axis=1) for m in self.models])
            rankings=l1distances.mean(axis=0).argsort()
            #trusted_model=self.models[np.argmax([len(p) for p in preds])]
            #rankings=np.sum((trusted_model.hashes[predicted,:]^trusted_model.hashes[qidx,:]),axis=1).argsort()
            predicted=predicted[rankings][:nnn]
            elapsed.append(time.time()-start)
            trueNNs=self.firstmodel.true_nns(qidx,nnn)
            allAPs.append(self.firstmodel.AP(predicted,trueNNs))
        if len(allAPs)<0.8*n_points:
            raise ValueError('Not a good search radius')
        return [*ms(allAPs),*ms(elapsed),*ms(numpredicted)]
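# Minimal usage sketch for the ensemble: compute_recall draws candidates from the first
# model's bins and ranks them by the mean Hamming distance across all K models (compute_ens_mAP
# instead unions candidates from every model). The function name, data and parameter values
# below are illustrative assumptions; the search radius may need tuning so that enough
# candidates are returned, otherwise compute_recall raises 'Not a good search radius'.
def _demo_lsh_ensemble_usage():
    inputs_=np.random.random((1000,128))
    ensemble=LSHensemble(inputs_,hash_length=16,K=4)
    #mean/std of recall, query time and candidate-set size over 20 random queries
    stats=ensemble.compute_recall(n_points=20,nnn=10,sr=2)
    return stats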
class flylsh(LSH):
    def __init__(self,data,hash_length,sampling_ratio,embedding_size):
        """
        data: Nxd matrix
        hash_length: scalar
        sampling_ratio: fraction of input dims to sample from when producing a hash
        embedding_size: dimensionality of projection space, m
        Note that in Flylsh, the hash_length and embedding_size are NOT the same
        whereas in usual LSH they are
        """
        self.hash_length=hash_length
        self.embedding_size=embedding_size
        K=embedding_size//hash_length
        self.data=(data-np.mean(data,axis=1)[:,None])
        num_projections=int(sampling_ratio*data.shape[1])
        weights=np.random.random((data.shape[1],embedding_size))
        yindices=np.arange(weights.shape[1])[None,:]
        xindices=weights.argsort(axis=0)[-num_projections:,:]
        self.weights=np.zeros_like(weights,dtype=np.bool)
        self.weights[xindices,yindices]=True #sparse projection vectors
        all_activations=(self.data@self.weights)
        xindices=np.arange(data.shape[0])[:,None]
        yindices=all_activations.argsort(axis=1)[:,-hash_length:]
        self.hashes=np.zeros_like(all_activations,dtype=np.bool)
        #threshold=np.sort(all_activations,axis=1)[:,-hash_length][:,None]
        self.hashes[xindices,yindices]=True #choose topk activations
        #self.dense_activations=all_activations
        #self.sparse_activations=self.hashes.astype(np.float32)*all_activations #elementwise product
        self.maxl1distance=2*self.hash_length
        self.lowd_hashes=all_activations.reshape((-1,hash_length,K)).sum(axis=-1) > 0

    def create_highd_bins(self,d,rounds=1):
        """
        This function implements a relaxed binning for FlyLSH
        This is only one of the many possible implementations for such a scheme
        d: the number of bits to match between hashes for putting them in the same bin
        """
        self.highd_bins=self.hashes[0:1,:] #initialize hashes to first point
        self.highd_binstopoints,self.highd_pointstobins={},{i:[] for i in range(self.hashes.shape[0])}
        for round in range(rounds):
            for hash_idx,this_hash in enumerate(self.hashes):
                overlap=(self.maxl1distance-((this_hash[None,:]^self.highd_bins).sum(axis=1)))>=2*d
                #print(overlap.shape)
                if overlap.any():
                    indices=np.flatnonzero(overlap)
                    #indices=indices.tolist()
                    #print(indices)
                    self.highd_pointstobins[hash_idx].extend(indices)
                    for idx in indices:
                        if idx not in self.highd_binstopoints:
                            #print(indices,idx)
                            self.highd_binstopoints[idx]=[]
                        self.highd_binstopoints[idx].append(hash_idx)
                else:
                    self.highd_bins=np.append(self.highd_bins,this_hash[None,:],axis=0)
                    bin_idx=self.highd_bins.shape[0]-1
                    self.highd_pointstobins[hash_idx].append(bin_idx)
                    self.highd_binstopoints[bin_idx]=[hash_idx]

    def create_lowd_bins(self):
        start=time.time()
        self.lowd_bins=np.unique(self.lowd_hashes,axis=0)
        #self.num_bins=self.bins.shape[0]
        assignment=np.zeros(self.lowd_hashes.shape[0])
        for idx,_bin in enumerate(self.lowd_bins):
            assignment[(self.lowd_hashes==_bin).all(axis=1)]=idx
        self.lowd_binstopoints={bin_idx:np.flatnonzero(assignment==bin_idx) for bin_idx in range(self.lowd_bins.shape[0])}
        self.lowd_pointstobins={point:int(_bin) for point,_bin in enumerate(assignment)}
        self.timetoindex=time.time()-start
    def query_lowd_bins(self,qidx,search_radius=1,order=False):
        if not hasattr(self,'lowd_bins'):
            raise ValueError('low dimensional bins for model not created')
        query_bin=self.lowd_bins[self.lowd_pointstobins[qidx]]
        valid_bins=np.flatnonzero((query_bin[None,:]^self.lowd_bins).sum(axis=1)<=2*search_radius)
        all_points=reduce(np.union1d,np.array([self.lowd_binstopoints[idx] for idx in valid_bins]))
        if order:
            l1distances=(self.hashes[qidx,:]^self.hashes[all_points,:]).sum(axis=1)
            all_points=all_points[l1distances.argsort()]
        return all_points

    def query_highd_bins(self,qidx,order=False):
        if not hasattr(self,'highd_bins'):
            raise ValueError('high dimensional bins for model not created')
        valid_bins=self.highd_pointstobins[qidx]
        all_points=reduce(np.union1d,np.array([self.highd_binstopoints[idx] for idx in valid_bins]))
        if order:
            l1distances=(self.hashes[qidx,:]^self.hashes[all_points,:]).sum(axis=1)
            all_points=all_points[l1distances.argsort()]
        return all_points

    def compute_query_mAP(self,n_points,search_radius=1,order=False,qtype='lowd',nnn=None):
        sample_indices=np.random.choice(self.hashes.shape[0],n_points)
        average_precisions=[]
        elapsed=[]
        numpredicted=[]
        ms = lambda l:(np.mean(l),np.std(l))
        for qidx in sample_indices:
            start=time.time()
            if qtype=='lowd':
                predicted=self.query_lowd_bins(qidx,search_radius,order)
            elif qtype=='highd':
                predicted=self.query_highd_bins(qidx,order)
            assert len(predicted)<self.hashes.shape[0],'All points being queried'
            if nnn is None:
                elapsed.append(time.time()-start)
            else:
                if len(predicted)<nnn:
                    #raise ValueError('Not a good search radius')
                    continue
                elapsed.append(time.time()-start)
            numpredicted.append(len(predicted))
            predicted=predicted[:nnn]
            truenns=self.true_nns(qidx,nnn=len(predicted))
            average_precisions.append(self.AP(predictions=predicted,truth=truenns))
        if nnn is not None:
            if len(average_precisions)<0.8*n_points:
                raise ValueError('Not a good search radius')
        return [*ms(average_precisions),*ms(elapsed),*ms(numpredicted)]

    def compute_recall(self,n_points,nnn,sr):
        sample_indices=np.random.choice(self.data.shape[0],n_points)
        recalls=[]
        elapsed=[]
        numpredicted=[]
        for qidx in sample_indices:
            start=time.time()
            predicted=self.query_lowd_bins(qidx,sr)
            if len(predicted)<nnn:
                raise ValueError('Not a good search radius') #continue
            numpredicted.append(len(predicted))
            rankings=np.sum((self.hashes[predicted,:]^self.hashes[qidx,:]),axis=1).argsort()
            predicted=predicted[rankings][:nnn]
            elapsed.append(time.time()-start)
            trueNNs=self.true_nns(qidx,nnn)
            recalls.append(len(set(predicted)&set(trueNNs))/nnn)
        return [np.mean(recalls),np.std(recalls),np.mean(elapsed),np.std(elapsed),np.mean(numpredicted),np.std(numpredicted)]

    def rank_and_findmAP(self,n_points,nnn):
        ms = lambda l:(np.mean(l),np.std(l))
        average_precisions=[]
        elapsed=[]
        for idx in range(n_points):
            start=time.time()
            average_precisions.append(self.findmAP(nnn,1))
            elapsed.append(time.time()-start)
        return [*ms(average_precisions),*ms(elapsed)]
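# Minimal usage sketch for the Fly LSH model: k winner-take-all bits are selected out of an
# m-dimensional sparse random expansion (m=20*k and a 10% sampling ratio, matching the
# experiments at the bottom of this file). The function name and the random data used here
# are illustrative assumptions.
def _demo_flylsh_usage(hash_length=16):
    inputs_=np.random.random((1000,128))
    fly=flylsh(inputs_,hash_length,sampling_ratio=0.10,embedding_size=20*hash_length)
    fly_map=fly.findmAP(nnn=20,n_points=50)        #brute-force ranking by Hamming distance
    fly.create_lowd_bins()                         #bucket on the folded low-d hashes
    candidates=fly.query_lowd_bins(qidx=0,search_radius=1)
    return fly_map,candidates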
class denseflylsh(flylsh):
    def __init__(self,data,hash_length,sampling_ratio,embedding_size):
        """
        data: Nxd matrix
        hash_length: scalar
        sampling_ratio: fraction of input dims to sample from when producing a hash
        embedding_size: dimensionality of projection space, m
        Note that in Flylsh, the hash_length and embedding_size are NOT the same
        whereas in usual LSH they are
        """
        self.hash_length=hash_length
        self.embedding_size=embedding_size
        K=embedding_size//hash_length
        self.data=(data-np.mean(data,axis=1)[:,None])
        weights=np.random.random((data.shape[1],embedding_size))
        self.weights=(weights>1-sampling_ratio) #sparse projection vectors
        all_activations=(self.data@self.weights)
        threshold=0
        self.hashes=(all_activations>=threshold) #binarize at zero instead of keeping only the top-k activations
        #self.dense_activations=all_activations
        #self.sparse_activations=self.hashes.astype(np.float32)*all_activations #elementwise product
        self.maxl1distance=2*self.hash_length
        self.lowd_hashes=all_activations.reshape((-1,hash_length,K)).sum(axis=-1) > 0

class lowdflylsh(LSH):
    def __init__(self,data,hash_length,sampling_ratio,embedding_size):
        """
        data: Nxd matrix
        hash_length: scalar
        sampling_ratio: fraction of input dims to sample from when producing a hash
        embedding_size: dimensionality of projection space, m
        Note that in Flylsh, the hash_length and embedding_size are NOT the same
        whereas in usual LSH they are
        """
        #f_bits=0.5
        self.hash_length=hash_length
        self.embedding_size=embedding_size
        K=embedding_size//hash_length
        self.data=(data-np.mean(data,axis=1)[:,None])
        weights=np.random.random((data.shape[1],embedding_size))
        self.weights=(weights>1-sampling_ratio)
        all_activations=(self.data@self.weights)
        self.activations=all_activations.reshape((-1,hash_length,K)).sum(axis=-1)
        #threshold=np.sort(self.activations,axis=1)[:,-int(f_bits*hash_length)][:,None]
        threshold=0
        self.hashes=(self.activations>=threshold) #binarize the folded activations at zero
        self.maxl1distance=2*self.hash_length

class WTAHash(flylsh):
    #implements Google's WTA hash
    def __init__(self,data,code_length,K=4):
        """
        hash_length: code length m in the paper
        """
        self.hash_length=code_length
        #K=1/wta_ratio, assuming a WTA ratio of 5% as in Fly LSH paper to make a fair comparison
        self.embedding_size=K*code_length
        self.data=data-np.mean(data,axis=1)[:,None] #this is not needed for WTAHash
        self.thetas=[np.random.choice(data.shape[1],K) for _ in range(code_length)]
        xindices=np.arange(data.shape[0],dtype=np.int32)
        yindices=self.data[:,self.thetas[0]].argmax(axis=1)
        #this line permutes the vectors with theta[0], takes the first K elements and computes
        #the index corresponding to the max element
        this_hash=np.zeros((data.shape[0],K),dtype=np.bool) # a K dim binary vector for each data point
        this_hash[xindices,yindices]=True #set the positions corresponding to argmax to True
        self.hashes=this_hash[:]
        for t in self.thetas[1:]:
            this_hash=np.zeros((data.shape[0],K),dtype=np.bool)
            yindices=self.data[:,t].argmax(axis=1) #same argmax computation as for thetas[0] above
            this_hash[xindices,yindices]=True
            self.hashes=np.concatenate((self.hashes,this_hash),axis=1)
        #concatenate all m K-dimensional binary hashes; this is a one-hot encoded version of
        #step 2 (C_X) in Algorithm 1 of the paper. This can also be implemented exactly as shown
        #in the paper. I chose this way as it allows us to use existing functions of the LSH
        #object to find mAP
        #self.tokens=np.sort(self.hashes.argsort(axis=1)[:,-self.hash_length:],axis=1)
        self.maxl1distance=2*self.hash_length
class FlyWTA(LSH):
    def __init__(self,data,hash_length,sampling_ratio,K):
        """
        data: Nxd matrix
        hash_length: scalar
        sampling_ratio: fraction of input dims to sample from when producing a hash
        K: WTA window size; the embedding size (dimensionality of projection space, m) is K*hash_length
        Note that in Flylsh, the hash_length and embedding_size are NOT the same
        whereas in usual LSH they are
        """
        self.hash_length=hash_length
        self.embedding_size=K*hash_length #embedding_size
        #K=embedding_size//hash_length
        self.data=(data-np.mean(data,axis=1)[:,None])
        num_projections=int(sampling_ratio*data.shape[1])
        weights=np.random.random((data.shape[1],self.embedding_size))
        yindices=np.arange(weights.shape[1])[None,:]
        xindices=weights.argsort(axis=0)[-num_projections:,:]
        self.weights=np.zeros_like(weights,dtype=np.bool)
        self.weights[xindices,yindices]=True #sparse projection vectors
        all_activations=(self.data@self.weights)
        self.thetas=[np.random.choice(all_activations.shape[1],K) for _ in range(self.hash_length)]
        xindices=np.arange(all_activations.shape[0],dtype=np.int32)
        yindices=all_activations[:,self.thetas[0]].argmax(axis=1)
        this_hash=np.zeros((all_activations.shape[0],K),dtype=np.bool) # a K dim binary vector for each data point
        this_hash[xindices,yindices]=True #set the positions corresponding to argmax to True
        self.hashes=this_hash[:]
        for t in self.thetas[1:]:
            this_hash=np.zeros((all_activations.shape[0],K),dtype=np.bool)
            yindices=all_activations[:,t].argmax(axis=1) #same argmax computation as for thetas[0] above
            this_hash[xindices,yindices]=True
            self.hashes=np.concatenate((self.hashes,this_hash),axis=1)
        self.maxl1distance=2*self.hash_length
        self.lowd_hashes=all_activations.reshape((-1,hash_length,K)).sum(axis=-1) > 0

class WTAHash2(LSH):
    #implements Google's WTA hash
    def __init__(self,data,code_length,K=4):
        """
        hash_length: code length m in the paper
        """
        self.hash_length=code_length
        self.embedding_size=K*code_length
        self.data=data-np.mean(data,axis=1)[:,None] #this is not needed for WTAHash
        n_cycles= self.embedding_size//self.data.shape[1] +(self.embedding_size%self.data.shape[1]>0)
        self.perms=[np.random.permutation(data.shape[1]) for _ in range(n_cycles)]
        self.thetas=[p[idx:idx+K] for p in self.perms for idx in range(0,len(p),K)][:self.hash_length]
        #print(len(self.thetas))
        xindices=np.arange(data.shape[0],dtype=np.int32)
        yindices=self.data[:,self.thetas[0]].argmax(axis=1)
        this_hash=np.zeros((data.shape[0],K),dtype=np.bool) # a K dim binary vector for each data point
        this_hash[xindices,yindices]=True #set the positions corresponding to argmax to True
        self.hashes=this_hash[:]
        for t in self.thetas[1:]:
            this_hash=np.zeros((data.shape[0],K),dtype=np.bool)
            yindices=self.data[:,t].argmax(axis=1) #same argmax computation as for thetas[0] above
            this_hash[xindices,yindices]=True
            self.hashes=np.concatenate((self.hashes,this_hash),axis=1)
        self.maxl1distance=2*self.hash_length
class AEflylsh(LSH):
    #implements Fly LSH where weights are pre-specified
    #The weights passed to init should be learnt from an autoencoder
    def __init__(self,data,hash_length,sampling_ratio,weights,local=False):
        """
        data: Nxd matrix
        hash_length: scalar
        sampling_ratio: fraction of input dims to sample from when producing a hash
        embedding_size: dimensionality of projection space, m
        weights: weights learnt from an autoencoder
        weights should have the same dimensionality as projection space (m)
        """
        #assert weights.shape[1]==embedding_size, f'Expects a {embedding_size} dim embedding from {weights.shape[1]} dim weights'
        self.hash_length=hash_length
        self.embedding_size=weights.shape[1]
        self.data=(data-np.mean(data,axis=1)[:,None])
        if local:
            self.weights=(weights>=np.sort(weights,axis=0)[-int(weights.shape[0]*sampling_ratio),:][None,:]) #sparse projection vectors
        else:
            n_weights=int(np.prod(weights.shape)*sampling_ratio)
            self.weights=(weights>=np.sort(weights,axis=None)[-n_weights]) #sparse projection vectors
        all_activations=(self.data@self.weights)
        threshold=np.sort(all_activations,axis=1)[:,-hash_length][:,None]
        self.hashes=(all_activations>=threshold) #choose top k activations
        self.maxl1distance=2*self.hash_length

class AutoEncoder(object):
    def __init__(self,nodes,is_sparse=False,rho=0.5,beta=2,dropconnect=False):
        """
        nodes: a list [in_dim,n_hidden]
        is_sparse: bool
        rho: if dropout is False, sparsity factor (fraction of weights turned on)
             otherwise, see below
        beta: weight of kl_divergence loss
              total_loss=reconstruction_loss+beta*kl_divergence
        dropout: if true, rho fraction of hidden units are dropped out
        """
        dropout=False
        self.in_dim=nodes[0]
        self.n_hidden=nodes[1]
        self.epochs=5
        self.learn_rate=[1e-3/(2**(e//3)) for e in range(self.epochs)]
        self.batch_size=32
        self.inputs_,self.targets,self.lr=self.get_placeholders()
        if is_sparse:
            self.rho=rho
            self.encode_weights=tf.Variable(tf.random_uniform([self.in_dim,self.n_hidden],minval=0,maxval=10*self.rho))
            self.decode_weights=tf.Variable(tf.truncated_normal([self.in_dim,self.n_hidden],stddev=0.05))
        else:
            self.encode_weights=tf.Variable(tf.truncated_normal([self.in_dim,self.n_hidden],stddev=0.05))
            self.decode_weights=self.encode_weights
        #biases=tf.Variable(tf.zeros([self.n_hidden])) #we don't want to use biases
        if dropconnect:
            self.encode_weights=tf.nn.dropout(self.encode_weights,keep_prob=tf.constant(rho))*rho
        hlayer=tf.matmul(self.inputs_,self.encode_weights)
        self.hlayer=tf.nn.relu(hlayer) #hlayer: relu for MNIST, sigmoid for GloVE
        if dropout:
            self.hlayer=tf.nn.dropout(self.hlayer,keep_prob=tf.constant(rho))
        output=tf.matmul(self.hlayer,tf.transpose(self.decode_weights))
        self.output=tf.nn.sigmoid(output)
        self.recon_loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.targets,logits=output)) #LAST CHANGE HERE: pass the pre-sigmoid activations as logits
        normed= lambda w: (w-tf.reduce_min(w))/(tf.reduce_max(w)-tf.reduce_min(w)) #normalize things to be between 0 and 1
        if is_sparse:
            rho_hat=tf.reduce_mean(normed(self.encode_weights)) #axis=0
            self.kl_loss=self.find_KL_div(self.rho,rho_hat)
            #self.kl_loss=tf.nn.l2_loss(self.weights1)
            self.cost=self.recon_loss+beta*self.kl_loss
        else:
            self.cost=self.recon_loss
        self.opt=tf.train.AdamOptimizer(self.lr).minimize(self.cost)

    def find_KL_div(self,rho,rho_hat):
        return rho*tf.log(rho)-rho*tf.log(rho_hat)+(1-rho)*tf.log(1-rho)-(1-rho)*tf.log(1-rho_hat)

    def get_placeholders(self):
        inputs_=tf.placeholder(tf.float32,[None,self.in_dim])
        targets=tf.placeholder(tf.float32,[None,self.in_dim])
        lr=tf.placeholder(tf.float32)
        return inputs_,targets,lr

    def train(self,data,maxsize=-1,show_recon=False):
        """data: a Dataset object"""
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            count=0
            for e in range(self.epochs):
                for batch_x,_ in data.train_batches(self.batch_size,sub_mean=True,maxsize=maxsize):
                    count+=1
                    feed={self.inputs_:batch_x,self.targets:batch_x,self.lr:self.learn_rate[e]}
                    _=sess.run([self.opt],feed_dict=feed)
                #print(f'Epoch {e+1}/{self.epochs}, recon_loss={rl}')
            all_weights=self.encode_weights.eval()
            #all_inputs=data.data.train.images[:maxsize] if data.name in ['MNIST','FMNIST'] else data.data[:maxsize]
            #all_inputs=all_inputs-all_inputs.mean(axis=1)[:,None]
            #feed={self.inputs_:all_inputs}
            #average_activations=sess.run(tf.reduce_mean(self.hlayer,axis=0),feed_dict=feed)
            #average_activations=average_activations[None,:]/average_activations.max()
            #print(average_activations)
            #all_weights=-np.abs(np.repeat(average_activations,data.indim,axis=0)-np.maximum(0.,all_inputs).sum(axis=0)[:,None].astype(np.float32))
            if show_recon:
                test_x,_=data.test_set(maxsize=10,sub_mean=True)
                feed={self.inputs_:test_x}
                recons=sess.run(self.output,feed_dict=feed)
                return (all_weights,(test_x,recons))
            return all_weights
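# Minimal usage sketch tying the autoencoder to AEflylsh: the learned encoder weights are
# sparsified (top entries kept per unit when local=True, or globally otherwise) and reused
# as the Fly projection matrix. The function name and all hyperparameters here are
# illustrative assumptions; this requires the TF1-style API used above.
def _demo_ae_fly_usage(hash_length=16,embedding_size=320):
    data=Dataset('mnist')
    ae=AutoEncoder([data.indim,embedding_size],is_sparse=False)
    weights=ae.train(data,maxsize=10_000)               #(784,embedding_size) encoder weights
    inputs_=data.data.train.images[:10_000]
    model=AEflylsh(inputs_,hash_length,sampling_ratio=0.10,weights=weights,local=True)
    return model.findmAP(nnn=200,n_points=100)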
class WTAAutoEncoder(AutoEncoder):
    def __init__(self,nodes,rho=0.1):
        """
        nodes: a list [in_dim,n_hidden]
        rho: sparsity factor (fraction of top activations kept during forward pass)
        """
        self.in_dim=nodes[0]
        self.n_hidden=nodes[1]
        self.epochs=5
        self.learn_rate=[1e-3/(2**(e//3)) for e in range(self.epochs)]
        self.batch_size=32
        self.inputs_,self.targets,self.lr=self.get_placeholders()
        self.rho=rho
        self.topk=int(self.n_hidden*self.rho)
        self.encode_weights=tf.Variable(tf.truncated_normal([self.in_dim,self.n_hidden],stddev=0.05))
        self.decode_weights=tf.Variable(tf.truncated_normal([self.in_dim,self.n_hidden],stddev=0.05))
        #biases=tf.Variable(tf.zeros([self.n_hidden])) #we don't want to use biases
        hlayer=tf.matmul(self.inputs_,self.encode_weights)
        hlayer=tf.nn.relu(hlayer) #hlayer: relu for MNIST, sigmoid for GloVE
        thresholds,_=tf.nn.top_k(hlayer,k=self.topk,sorted=True)
        thresholds=thresholds[:,-1]
        mask=(hlayer-tf.expand_dims(thresholds,1))>=0
        self.hlayer=hlayer*tf.cast(mask,dtype=tf.float32)
        self.output=tf.matmul(self.hlayer,tf.transpose(self.decode_weights))
        self.cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.targets,logits=self.output)) #LAST CHANGE HERE
        self.opt=tf.train.AdamOptimizer(self.lr).minimize(self.cost)

colors={'LSH':'red','AELSH':'black','Fly':'green','SparseAEFly':'blue',
        'AEFly':'orange','AE_local':'red','AE_global':'green','DenseFly':'blue',
        'WTA':'teal','ExWTA':'black','very_sparse_RP':'black','lowfly':'blue','PQ':'pink','FlyWTA':'pink'}
def plot_results(all_results,hash_lengths=None,keys=None,name='data',location='./',metric='mAP'):
    if hash_lengths is None:
        hash_lengths=sorted(all_results.keys())
    if keys is None:
        keys=list(all_results[hash_lengths[0]].keys())
    Lk=len(keys)
    fmt= lambda mk:mk.join([k for k in keys])
    global colors
    if metric=='mAP':
        curve_ylabel='mean Average Precision (mAP)'
        min_y=0
        mean= lambda x,n:np.mean(all_results[x][n])
        stdev=lambda x,n:np.std(all_results[x][n])
    elif metric=='auprc':
        curve_ylabel='Area under precision recall curve'
        min_y=0
        n_trials=len(all_results[hash_lengths[0]][keys[0]])
        all_precisions={hl:{k:[all_results[hl][k][i][0] for i in range(n_trials)] for k in keys} for hl in hash_lengths}
        all_recalls={hl:{k:[all_results[hl][k][i][1]/np.max(all_results[hl][k][i][1]) for i in range(n_trials)] for k in keys} for hl in hash_lengths}
        auprc= lambda hl,k,i: np.sum(np.gradient(all_recalls[hl][k][i])*all_precisions[hl][k][i])
        mean= lambda hl,k:np.mean([auprc(hl,k,i) for i in range(n_trials)])
        stdev=lambda hl,k:np.std([auprc(hl,k,i) for i in range(n_trials)]) #np.std(np.array(all_MAPs[x][n]),axis=0)
    elif metric=='auroc':
        curve_ylabel='Area under Receiver Operating Characteristic (ROC) curve'
        min_y=0.5
        n_trials=len(all_results[hash_lengths[0]][keys[0]])
        all_tprs={hl:{k:[all_results[hl][k][i][1] for i in range(n_trials)] for k in keys} for hl in hash_lengths}
        all_fprs={hl:{k:[all_results[hl][k][i][0]/np.max(all_results[hl][k][i][0]) for i in range(n_trials)] for k in keys} for hl in hash_lengths}
        auroc= lambda hl,k,i: np.sum(np.gradient(all_fprs[hl][k][i])*all_tprs[hl][k][i])
        mean= lambda hl,k:np.mean([auroc(hl,k,i) for i in range(n_trials)])
        stdev=lambda hl,k:np.std([auroc(hl,k,i) for i in range(n_trials)]) #np.std(np.array(all_MAPs[x][n]),axis=0)
    p=figure(x_range=[str(h) for h in hash_lengths],title=f'{fmt(",")} on {name}')
    delta=0.5/(Lk+1)
    deltas=[delta*i for i in range(-Lk,Lk)][1::2]
    assert len(deltas)==Lk, 'Bad luck'
    x_axes=np.sort(np.array([[x+d for d in deltas] for x in range(1,1+len(hash_lengths))]),axis=None)
    means=[mean(hashl,name) for name,hashl in zip(keys*len(hash_lengths),sorted(hash_lengths*Lk))]
    stds=[stdev(hashl,name) for name,hashl in zip(keys*len(hash_lengths),sorted(hash_lengths*Lk))]
    for i in range(len(hash_lengths)):
        for j in range(Lk):
            p.vbar(x=x_axes[Lk*i+j], width=delta, bottom=0, top=means[Lk*i+j], color=colors[keys[j]],legend=keys[j])
    err_xs=[[i,i] for i in x_axes]
    err_ys= [[m-s,m+s] for m,s in zip(means,stds)]
    p.y_range.bounds=(min_y,np.floor(10*max(means))/10 + 0.1)
    p.multi_line(err_xs, err_ys,line_width=2, color='black',legend='stdev')
    p.legend.location='top_left'
    p.legend.click_policy='hide'
    p.xaxis.axis_label='Hash length (k)/Code length (bits)'
    p.yaxis.axis_label= curve_ylabel
    output_file(f'{location+fmt("_")}_{name}.html')
    show(p)

def plothlcurve(all_results,hl,name='data',location='./',metric='prc'):
    global colors
    assert hl in all_results.keys(), 'Provide a valid hash length'
    keys=list(all_results[hl].keys())
    n_trials=len(all_results[hl][keys[0]])
    if metric=='prc':
        all_ys={k:np.mean([all_results[hl][k][i][0] for i in range(n_trials)],axis=0) for k in keys}
        all_xs={k:np.mean([all_results[hl][k][i][1] for i in range(n_trials)],axis=0) for k in keys}
        all_xs={k:all_xs[k]/np.max(all_xs[k]) for k in keys}
        title=f'Precision recall curves for {name}, hash length={hl}'
        xlabel='Recall'
        ylabel='Precision'
        legend_location='top_right'
    elif metric=='roc':
        all_xs={k:np.mean([all_results[hl][k][i][0] for i in range(n_trials)],axis=0) for k in keys}
        all_ys={k:np.mean([all_results[hl][k][i][1] for i in range(n_trials)],axis=0) for k in keys}
        all_xs={k:all_xs[k]/np.max(all_xs[k]) for k in keys}
        title=f'ROC curves for {name}, hash length={hl}'
        xlabel='False Positive rate'
        ylabel='True Positive rate'
        legend_location='bottom_right'
    auc= lambda k: np.sum(np.gradient(all_xs[k])*all_ys[k])
    aucs={k:auc(k) for k in keys}
    p=figure(title=title)
    for k in keys:
        leg='{}({:.2f})'.format(k,0.01*np.floor(100*np.mean(aucs[k])))
        p.line(all_xs[k],all_ys[k],line_width=2,color=colors[k],legend=leg)
    if metric=='roc':
        p.line(np.arange(100)/100.0,np.arange(100)/100.0,line_width=1,line_dash='dashed',legend='random (0.5)')
        #show random classifier line for ROC metrics
    p.legend.location=legend_location
    p.legend.click_policy='hide'
    p.xaxis.axis_label=xlabel
    p.yaxis.axis_label=ylabel
    output_file(f'{location}{metric}_{name}_{hl}.html')
    show(p)

def parse_computed(foldername):
    allfiles=os.listdir(foldername)
    mnames=['LSH','Fly','WTA']
    fmlname={'LSH':'LSH','Fly':'Fly','WTA':'WTA'}
    #mnames=['lsh','fly','WTA']
    #fmlname={'lsh':'LSH','fly':'Fly','WTA':'WTA'}
    hash_lengths=[4,8,16,24,32,48,64,96,128,192,256]
    allmaps={hl:{} for hl in hash_lengths}
    for hl in hash_lengths:
        for mnm in mnames:
            allmaps[hl][fmlname[mnm]]=[]
            possible=[f for f in allfiles if mnm+str(hl)+'_' in f]
            for fnm in possible:
                f=open(foldername+fnm,'r')
                allmaps[hl][fmlname[mnm]].append(float(f.read()))
    return allmaps
if __name__=='__main__':
    data=Dataset('mnist')
    input_dim=784 #d
    max_index=10000
    sampling_ratio=0.10
    nnn=200 #number of nearest neighbours to compare, 2% of max_index as in paper
    hash_lengths=[2,4,8,12,16,20,24,28,32]
    inputs_=data.data.train.images[:max_index]
    all_MAPs={}
    for hash_length in hash_lengths: #k
        embedding_size= int(20*hash_length) #int(10*input_dim) #20k or 10d
        all_MAPs[hash_length]={}
        all_MAPs[hash_length]['Fly']=[]
        all_MAPs[hash_length]['LSH']=[]
        for _ in range(10):
            fly_model=flylsh(inputs_,hash_length,sampling_ratio,embedding_size)
            fly_mAP=fly_model.findmAP(nnn,1000)
            msg='mean average precision is equal to {:.2f}'.format(fly_mAP)
            #_=os.system('say "'+msg+'"') #works only on mac
            all_MAPs[hash_length]['Fly'].append(fly_mAP)
            dense_model=LSH(inputs_,hash_length)
            dense_mAP=dense_model.findmAP(nnn,1000)
            all_MAPs[hash_length]['LSH'].append(dense_mAP)
            msg='mean average precision is equal to {:.2f}'.format(dense_mAP)
            #_=os.system('say "'+msg+'"') #works only on mac
            print('Both models ran successfully')
        print(f'{hash_length} done')
    print(all_MAPs)