import sys, os, time import numpy as np import scipy as sci import scipy.sparse.linalg as slin import copy from mytools.MinTree import MinTree from scipy.sparse import coo_matrix, csr_matrix, lil_matrix from mytools.ioutil import loadedge2sm from gendenseblock import * from matricizationSVD import * from edgepropertyAnalysis import * import math class Ptype(object): freq =0 ts = 1 rate=2 @staticmethod def ptype2str(p): if p == Ptype.freq: return 'freq' if p == Ptype.ts: return 'ts' if p == Ptype.rate: return 'rate' @staticmethod def ptypes2str(ptypes): strs=[] if Ptype.freq in ptypes: strs.append(Ptype.ptype2str(Ptype.freq)) if Ptype.ts in ptypes: strs.append(Ptype.ptype2str(Ptype.ts)) if Ptype.rate in ptypes: strs.append(Ptype.ptype2str(Ptype.rate)) pstr = '-'.join(strs) return pstr class HoloScopeOpt: def __init__(self, graphmat, qfun='exp', b=32, aggmethod='sum', sdrop=True, mbd=0.5, sdropscale='linear', tsfile=None, tunit='s', ratefile=None): 'how many times of a user rates costumers if he get the cost balance' self.coe = 0 'the larger expbase can give a heavy penalty to the power-law curve' self.expbase = b self.scale = qfun self.b = b self.aggmethod=aggmethod self.suspbd = 0.0 #susp < suspbd will assign to zero self.priordropslop=sdrop self.graph=graphmat.tocoo() self.graphr = self.graph.tocsr() self.graphc = self.graph.tocsc() self.matricizetenor=None self.nU, self.nV=graphmat.shape self.indegrees = graphmat.sum(0).getA1() self.e0 = math.log(graphmat.sum(), self.nU) #logrithm of edges print 'matrix size: {} x {}\t#edges: {}'.format(self.nU, self.nV, self.indegrees.sum()) self.tsfile, self.ratefile, self.tunit = tsfile, ratefile, tunit self.tspim, self.ratepim = None, None 'field for multiple property graph' if tsfile is not None or ratefile is not None: if self.priordropslop: self.orggraph = self.graphr.copy() else: self.orggraph = self.graphr if tsfile is not None: self.mbd = mbd #multiburst bound self.tspim = MultiEedgePropBiGraph(self.orggraph) self.tspim.load_from_edgeproperty(tsfile, mtype=csr_matrix, dtype=np.int64) self.tspim.setup_ts4all_sinks(tunit) if self.priordropslop: 'slops weighted with max burst value' self.weightWithDropslop(weighted=True, scale=sdropscale) else: self.priordropslop = False #no input of time attribute if ratefile is not None: self.ratepim = MultiEedgePropBiGraph(self.orggraph) self.ratepim.load_from_edgeproperty(ratefile, mtype=csr_matrix, dtype=float) self.ratepim.setup_rate4all_sinks() 'weighed with idf prior from Fraudar' #self.weightWithIDFprior() 'if weighted the matrix the windegrees is not equal to indegrees' self.windegrees = self.graphc.sum(0).getA1() self.woutdegrees = self.graphr.sum(1).getA1() self.A = np.array([]) #binary array self.fbs = np.zeros(graphmat.shape[1], dtype=np.int) #frequency of bs in B '\frac_{ f_A{(bi)} }{ f_U{(bi)}}' self.bsusps = np.array([]) # the suspicious scores of products given A self.vx = 0 # current objective value self.vxs = [] #record all the vxs of optimizing iterations self.Y= np.array([]) self.yfbs = np.array([]) self.ybsusps = np.array([]) 'current is the best' self.bestvx = self.vx self.bestA = np.array([]) self.bestfbs = np.array([]) self.bestbsusps = np.array([]) def weightWithDropslop(self, weighted, scale): 'weight the adjacency matrix with the sudden drop of ts for each col' if weighted: colWeights = np.multiply(self.tspim.dropslops, self.tspim.dropfalls) else: colWeights = self.tspim.dropslops if scale == 'logistic': from scipy.stats import logistic from sklearn import preprocessing 'zero mean scale' colWeights = preprocessing.scale(colWeights) colWeights = logistic.cdf(colWeights) elif scale == 'linear': from sklearn import preprocessing #add a base of suspecious for each edge colWeights = preprocessing.minmax_scale(colWeights) +1 elif scale == 'plusone': colWeights += 1 elif scale == 'log1p': colWeights = np.log1p(colWeights) + 1 else: print '[Warning] no scale for the prior weight' n = self.nV colDiag = lil_matrix((n, n)) colDiag.setdiag(colWeights) self.graphr = self.graphr * colDiag.tocsr() self.graph = self.graphr.tocoo(copy=False) self.graphc = self.graph.tocsc(copy=False) print "finished computing weight matrix" def weightWithIDFprior(self): print 'weightd with IDF prior' colWeights = 1.0/np.log(self.indegrees + 5) n = self.nV colDiag = lil_matrix((n, n)) colDiag.setdiag(colWeights) self.graphr = self.graphr * colDiag.tocsr() self.graph = self.graphr.tocoo(copy=False) self.graphc = self.graph.tocsc(copy=False) return 'new objective with no f_A(v)/|A|' def maxobjfunc(self, A, fbs, bsusps=None): nu = 0.0 de = 0.0 numA = np.sum(A) de = numA + bsusps.sum() #math.sqrt(numA*bsusps.sum())#similar if numA == 0: return 0 if bsusps is not None: nu = np.dot(fbs, bsusps) else: nu = fbs.sum() res = nu/np.float64( de ) return res def aggregationMultiProp(self, mbs, method='sum'): if method == 'rank': from scipy.stats import rankdata rankmethod = 'average' k=60 #for rank fusion if len(mbs) == 1: val = mbs.values()[0] if method == 'rank': rb = rankdata(-np.array(val), method=rankmethod) return np.reciprocal(rb+k) * k else: return val if method == 'sum': 'this is the joint probability of exp form of prob' bsusps = mbs.values()[0] for v in mbs.values()[1:]: bsusps += v elif method == 'rank': 'rank fusion' arrbsusps = [] for val in mbs.values(): rb = rankdata(-np.array(val), method=rankmethod) arrbsusps.append(np.reciprocal(rb+k)) bsusps = np.array(arrbsusps).sum(0) * k else: print '[Error] Invalid method {}\n'.format(method) return bsusps #@profile def evalsusp4ts(self, suspusers, multiburstbd = 0.5, weighted=True): 'the id of suspusers consistently starts from 0 no matter the source' incnt, inratio = self.tspim.suspburstinvolv(multiburstbd, weighted, delta=True) suspts=inratio return suspts #@profile def evalsusp4rate(self, suspusers, neutral=False, scale='max'): susprates = self.ratepim.suspratedivergence(neutral, delta=True) if scale == 'max': assert(self.ratepim.maxratediv > 0) nsusprates = susprates/self.ratepim.maxratediv elif scale=='minmax': #need a copy, and do not change susprates' value for delta nsusprates = preprocessing.minmax_scale(susprates, copy=True) else: #no scale nsusprates = susprates return nsusprates 'sink suspicious with qfunc, no f_A(v)/|A|' def prodsuspicious(self, fbs, A=None, scale='exp', ptype=[Ptype.freq]): multibsusps={} if Ptype.freq in ptype: posids = self.windegrees>0 bs = np.zeros(self.nV) bs[posids] = np.divide(fbs[posids], self.windegrees[posids].astype(np.float64)) multibsusps[Ptype.freq] = bs if Ptype.ts in ptype: suspusers = A.nonzero()[0] bs = self.evalsusp4ts(suspusers, multiburstbd=self.mbd) multibsusps[Ptype.ts] = bs if Ptype.rate in ptype: suspusers = A.nonzero()[0] bs = self.evalsusp4rate(suspusers) multibsusps[Ptype.rate] = bs bsusps = self.aggregationMultiProp(multibsusps, self.aggmethod) bsusps = self.qfunc(bsusps, fbs=fbs, scale=scale, numratios=len(multibsusps)) return bsusps def initpimsuspects(self, suspusers, ptype): if Ptype.ts in ptype: self.tspim.setupsuspects(suspusers) temp1, temp2 = self.tspim.suspburstinvolv(multiburstbd=0.5, weighted=True, delta=False) if Ptype.rate in ptype: self.ratepim.setupsuspects(suspusers) tmp = self.ratepim.suspratedivergence(neutral=False, delta=False) return def start(self, A0, ptype=[Ptype.ts]): self.A = A0 users = A0.nonzero()[0] self.ptype=ptype # the property type that the postiorer uses self.fbs = self.graphr[users].sum(0).getA1() self.fbs = self.fbs.astype(np.float64, copy=False) 'initially set up currrent suspects' self.initpimsuspects(users, ptype=ptype) self.bsusps = self.prodsuspicious(self.fbs, self.A, ptype=ptype) self.vx = self.maxobjfunc(self.A, self.fbs, self.bsusps) self.vxs.append(self.vx) "current is the best" self.bestA = np.array(self.A) self.bestvx = self.vx self.bestfbs = np.array(self.fbs) self.bestbsusps = np.array(self.bsusps) def candidatefbs(self, z): 'increase or decrease' coef = 1 if self.A[z] == 0 else -1 bz = self.graphr[z] candfbs = (coef*bz + self.fbs).getA1() return candfbs #@profile def greedyshaving(self): '''greedy algorithm''' maxint = np.iinfo(np.int64).max/2 delscores = np.array([maxint]*self.nU) delcands = self.A.nonzero()[0] deluserCredit = self.graphr[delcands,:].dot(self.bsusps) delscores[delcands] = deluserCredit print 'set up the greedy min tree' MT = MinTree(delscores) i=0 sizeA = np.sum(self.A) sizeA0 = sizeA setA = set(self.A.nonzero()[0]) while len(setA) > 0: z, nextdelta = MT.getMin() setY = setA - {z} Y = copy.copy(self.A) # A is X Y[z] = 1-Y[z] self.Y=Y self.yfbs = self.candidatefbs(z) Ylist = Y.nonzero()[0] self.setdeltapimsusp(z, Ylist, add=False) self.ybsusps = self.prodsuspicious(self.yfbs, self.Y, ptype=self.ptype) vy = self.maxobjfunc(self.Y, self.yfbs, self.ybsusps) 'chose next if next if the best' if vy > self.bestvx: self.bestA = np.array(self.Y) self.bestfbs = self.yfbs self.bestbsusps = self.ybsusps self.bestvx = vy MT.changeVal(z, maxint) #make the min to the largest for deletion prodchange = self.ybsusps - self.bsusps effectprod = prodchange.nonzero()[0] if len(effectprod)>0: #this is delta for all users userdelta = self.graphc[:,effectprod].dot(prodchange[effectprod]) yuserdelta = userdelta[Ylist] for u in yuserdelta.nonzero()[0]: uidx = Ylist[u] MT.changeVal(uidx,yuserdelta[u]) 'delete next user, make current to next' self.A = self.Y sizeA -= 1 setA = setY self.fbs = self.yfbs self.bsusps = self.ybsusps self.vx = vy self.vxs.append(self.vx) if i % (sizeA0/100 + 1) == 0: sys.stdout.write('.') sys.stdout.flush() i+=1 print '' return np.sum(self.A) def initfastgreedy(self, ptype, numSing, rbd='avg'): ''' default: ptype=[Ptype.freq], numSing=10, rbd='avg' ''' self.ptype=ptype self.numSing=numSing #number of singular vectors we consider self.avgexponents=[] if len(ptype)==1: self.initfastgreedy2D(numSing, rbd) elif len(ptype) > 1: self.initfastgreedyMD(numSing, rbd) self.bestvx = -1 self.qchop=False #reciprocal of indegrees self.sindegreciprocal = csr_matrix(self.windegrees).astype(np.float64) data = self.sindegreciprocal.data nozidx = data.nonzero()[0] self.sindegreciprocal.data[nozidx] = data[nozidx]**(-1) return def tenormatricization(self, tspim, ratepim, tbindic, rbins, mtype=coo_matrix, dropweight=True, logdegree=False): 'matricize the pim of ts and rates into matrix' if tspim is None and ratepim is None: return self.graph, range(self.nV) tscm, rtcm, dl = None, None,0 if Ptype.ts in self.ptype and tspim is not None: tscm = tspim.edgeidxm.tocoo() dl = len(tscm.data) if Ptype.rate in self.ptype and ratepim is not None: rtcm = ratepim.edgeidxm.tocoo() dl = len(rtcm.data) if dropweight is True and tspim is not None: w = np.multiply(tspim.dropfalls, tspim.dropslops) w = np.log1p(w) + 1 else: w = np.ones(self.nV) xs, ys, data, colWeights = [],[],[],[] # for matricized tenor matcols, rindexcols={},{} for i in xrange(dl): if tscm is not None and rtcm is not None: assert(tscm.row[i] == rtcm.row[i] and tscm.col[i] == rtcm.col[i]) u = tscm.row[i] v = tscm.col[i] for t1, r1 in zip(tspim.eprop[tscm.data[i]], ratepim.eprop[rtcm.data[i]]): t = t1/int(tbindic[self.tunit]) r = rbins(r1) strcol = ' '.join(map(str,[v,t,r])) if strcol not in matcols: idx = len(matcols) matcols[strcol] = idx rindexcols[idx]=strcol xs.append(u) ys.append(matcols[strcol]) data.append(1.0) elif tscm is not None: u = tscm.row[i] v = tscm.col[i] for t1 in tspim.eprop[tscm.data[i]]: t = t1/int(tbindic[self.tunit]) strcol = ' '.join(map(str,[v,t])) if strcol not in matcols: idx = len(matcols) matcols[strcol] = idx rindexcols[idx]=strcol xs.append(u) ys.append(matcols[strcol]) data.append(1.0) elif rtcm is not None: u = rtcm.row[i] v = rtcm.col[i] for r1 in ratepim.eprop[rtcm.data[i]]: r = rbins(r1) strcol = ' '.join(map(str,[v,r])) if strcol not in matcols: idx = len(matcols) matcols[strcol] = idx rindexcols[idx]=strcol xs.append(u) ys.append(matcols[strcol]) data.append(1.0) else: print 'Warning: no ts and rate for matricization' return self.graph, range(self.nV) nrow, ncol = max(xs)+1, max(ys)+1 sm = mtype( (data, (xs, ys)), shape=(nrow, ncol), dtype=np.float64 ) if logdegree: print 'using log degree' sm.data[0:] = np.log1p(sm.data) if dropweight: m1, n1 = sm.shape for i in xrange(n1): pos = rindexcols[i].find(' ') v = int(rindexcols[i][:pos]) colWeights.append(w[v]) colDiag = lil_matrix((n1, n1)) colDiag.setdiag(colWeights) sm = sm * colDiag.tocsr() return sm, rindexcols def initfastgreedyMD(self, numSing, rbd): ''' use matricizationSVD instead of freq matrix svd ''' afile = self.tsfile if self.tsfile is not None else self.ratefile ipath = os.path.dirname(os.path.abspath(afile)) tbindic={'s':24*3600, 'd':30} 'edgepropertyAnalysis has already digitized the ratings' rbins = lambda x: int(x) #lambda x: 0 if x<2.5 else 1 if x<=3.5 else 2 tunit = self.tunit print 'generate tensorfile with tunit:{}, tbins:{}'.format(tunit, tbindic[tunit]) if self.matricizetenor is None: matricize_start = time.clock() sm, rindexcol = self.tenormatricization(self.tspim, self.ratepim, tbindic, rbins, mtype=coo_matrix, dropweight=self.priordropslop, logdegree=False) self.matricizetenor = sm print '::::matricize time cost: ', time.clock() - matricize_start sm = self.matricizetenor print "matricize {}x{} and svd dense... ..."\ .format(sm.shape[0], sm.shape[1]) u, s, vt = slin.svds(sm, k=numSing, which='LM') u = np.fliplr(u) s = s[::-1] CU, CV = [],[] for i in xrange(self.numSing): ui = u[:, i] si = s[i] if abs(max(ui)) < abs(min(ui)): ui = -1*ui if type(rbd) is float: sqrtSi = math.sqrt(si) ui *= sqrtSi rbdrow= rbd elif rbd == 'avg': rbdrow = 1.0/math.sqrt(self.nU) else: print 'unkown rbd {}'.format(rbd) rows = np.argsort(-ui, axis=None, kind='quicksort') for jr in xrange(len(rows)): r = rows[jr] if ui[r] <= rbdrow: break self.avgexponents.append(math.log(jr, self.nU)) 'consider the # limit' if self.nU > 1e6: e0 = self.e0 ep = max(1.6, 2.0/(3-e0)) nn = sm.shape[0] + sm.shape[1] nlimit = int(math.ceil(nn**(1/ep))) cutrows = rows[:min(jr,nlimit)] else: cutrows = rows[:jr] CU.append(cutrows) self.CU = np.array(CU) self.CV = np.array(CV) return def initfastgreedy2D(self, numSing, rbd): 'rbd threshold that cut the singular vecotors, default is avg' 'parameters for fastgreedy' u, s, vt = slin.svds(self.graphr.astype(np.float64), k=numSing, which='LM') #revert to make the largest singular values and vectors in the front u = np.fliplr(u) vt = np.flipud(vt) s = s[::-1] self.U = [] self.V = [] self.CU = [] self.CV = [] for i in xrange(self.numSing): ui = u[:, i] vi = vt[i, :] si = s[i] if abs(max(ui)) < abs(min(ui)): ui = -1*ui if abs(max(vi)) < abs(min(vi)): vi = -1*vi if type(rbd) is float: sqrtSi = math.sqrt(si) ui *= sqrtSi vi *= sqrtSi rbdrow, rbdcol = rbd, rbd elif rbd == 'avg': rbdrow = 1.0/math.sqrt(self.nU) rbdcol = 1.0/math.sqrt(self.nV) else: print 'unkown rbd {}'.format(rbd) rows = np.argsort(-ui, axis=None, kind='quicksort') cols = np.argsort(-vi, axis=None, kind='quicksort') for jr in xrange(len(rows)): r = rows[jr] if ui[r] <= rbdrow: break self.avgexponents.append(math.log(jr, self.nU)) if self.nU > 5e5: e0=self.e0 ep = max(1.6, 2.0/(3-e0)) nn = self.nU + self.nV nlimit = int(math.ceil(nn**(1.0/ep))) cutrows = rows[:min(jr,nlimit)] else: cutrows = rows[:jr] for jc in xrange(len(cols)): c = cols[jc] if vi[c] <= rbdcol: break cutcols = cols[:jc] 'begin debug' self.U.append(ui) self.V.append(vi) 'end debug' self.CU.append(cutrows) self.CV.append(cutrows) self.CU = np.array(self.CU) self.CV = np.array(self.CV) return def qfunc(self, ratios, fbs=None, scale='exp', numratios=1): if self.aggmethod == 'rank': 'do not use qfun if it is rank aggregation' return ratios if self.suspbd <= 0.0: greatbdidx = ratios > 0.0 else: greatbdidx = ratios >= self.suspbd lessbdidx = ratios < self.suspbd 'picewise q funciton if < suspbd, i.e. epsilon' ratios[lessbdidx] = 0.0 'picewise q funciton if >= suspbd, i.e. epsilon' if scale == 'exp': ratios[greatbdidx] = self.expbase**(ratios[greatbdidx]-numratios) elif scale == 'pl': ratios[greatbdidx] = ratios[greatbdidx]**self.b elif scale == 'lin': ratios[greatbdidx] = np.fmax(self.b*(ratios[greatbdidx]-1)+1, 0) else: print 'unrecognized scale: ' + scale sys.exit(1) return ratios def setdeltapimsusp(self, z, ysuspusers, add): if Ptype.ts in self.ptype: self.tspim.deltasuspects(z, ysuspusers, add) if Ptype.rate in self.ptype: self.ratepim.deltasuspects(z, ysuspusers, add) return def removecurrentblock(self, rows): '''it is for find second block, remove rows from self.graph, self.matricizetenor ''' print 'removing {} rows from graph'.format(len(rows)) lilm = self.graph.tolil() lilm[rows,:]=0 self.graph=lilm.tocoo() self.graphc= lilm.tocsc() self.graphr = self.graph.tocsr() if self.matricizetenor is not None: print 'removing {} rows from tensor'.format(len(rows)) lilmm = self.matricizetenor.tolil() lilmm[rows,:] = 0 self.matricizetenor = lilmm.tocoo() return #@profile def fastgreedy(self): 'adding and deleting greed algorithm' 'No Need: user order for r with obj fuct' self.fastlocalbest = [] self.fastbestvx = 0 self.fastbestA, self.fastbestfbs, self.fastbestbsusps = \ np.zeros(self.nU), np.zeros(self.nV), np.zeros(self.nV) for k in xrange(self.numSing): print 'process {}-th singular vector'.format(k+1) lenCU = len(self.CU[k]) if lenCU == 0: continue print '*** *** shaving ...' A0 = np.zeros(self.nU, dtype=int) A0[self.CU[k]]=1 #shaving from sub singluar space #import ipdb;ipdb.set_trace() #print 'debug: init size: ', A0.sum() self.start(A0, ptype=self.ptype) self.greedyshaving() print '*** *** shaving opt size: {}'.format(sum(self.bestA)) print '*** *** shaving opt value: {}'.format(self.bestvx) if self.fastbestvx < self.bestvx: self.fastbestvx = self.bestvx self.fastbestA = np.array(self.bestA) self.fastbestfbs = np.array(self.bestfbs) self.fastbestbsusps = np.array(self.bestbsusps) print '=== === improved opt size: {}'.format(sum(self.fastbestA)) print '=== === improved opt value: {}'.format(self.fastbestvx) brankscores = np.multiply(self.bestbsusps, self.bestfbs) A = self.bestA.nonzero()[0] self.fastlocalbest.append((self.bestvx, (A, brankscores))) 'clear shaving best' self.bestvx = 0 self.bestvx, self.bestA, self.bestfbs, self.bestbsusps = \ self.fastbestvx, self.fastbestA, \ self.fastbestfbs, self.fastbestbsusps return def drawObjectiveCurve(self, outfig): import matplotlib.pyplot as plt fig = plt.figure() plt.plot(self.vxs, '-') plt.title('The convergence curve of simulated anealing.') plt.xlabel('# of iterations') plt.ylabel('objective value') if outfig is not None: fig.savefig(outfig) return fig def HoloScope(wmat, alg, ptype, qfun, b, ratefile=None, tsfile=None, tunit='s', numSing=10, nblock=1): ''' The interface of HoloScope algorithm for external use Parameters ---------- wmat: str or sparse matrix If it is str, wmat is the input file name. We load the file into sparse matrix. If it is sparse matrix, we just use wmat. alg: str which algorithm you are going to use. You can choose 'greedy' for synthetic data (#rows+#cols<10000); or 'fastgreedy' for any size of data sets. ptype: list contains which attributes the algorithm is going to use. The hololisc use of all siginals is [Ptype.freq, Ptype.ts, Ptype.rate] qfun: str which kind of qfun the algorithm uses, choosing from 'exp' for exponential (recommended), 'pl' for power-law, 'lin' for linear b: float The base of exponetial qfun, or the exponent of power-law qfun, or absolute slope of linear qfun ratefile: str or None The file name with path for user-object rating sequences. The file format is that each line looks like 'userid-objectid:#star1 #star2 ...\n' tsfile: str or None The file name with path for user-object timestamp sequences. The file format is that each line looks like 'userid-objectid:t1 t2 ...\n' tunit: str (only support 's' or 'd') or None The time unit of input time e.g. in amazon and yelp data, the time is date, i.e. tunit='d'. We use # of days (integer) from the earlest date as input numSing: int The number of first left singular vectors used in our algorithm nblock: int The number of block we need from the algorithm Return --------- (gbestvx, (gsrows, gbscores)), opt Block (gsrows, gbscores) has the best objective values 'gbestvx' among *nblock* blocks. gbestvx: float the best objective value of the *nblock* blocks. gsrows: list is the list of suspicious rows. gbscores: list is the suspicoius scores for every objects. The index is object id, and value is the score. With the scores, you can get the suspicious rank of the objects. opt: instance of HoloScopeOpt class the class instance which contains all the *nblock* blocks in opt.nbests. opt.nbests: list This is the list contains *nblock* solutions in the form of tuple, i.e., (opt.bestvx, (srows, bscores)) ''' print 'initial...' if sci.sparse.issparse(wmat) is False and os.path.isfile(wmat): sm = loadedge2sm(wmat, coo_matrix, weighted=True, idstartzero=True) else: sm = wmat.tocoo() inprop = 'Considering ' if Ptype.freq in ptype: inprop += '+[topology] ' if Ptype.ts in ptype: assert(os.path.isfile(tsfile)) inprop += '+[timestamps] ' #elif tsfile is not None: #consider sdrop by default when Ptype.ts inprop += '+[sudden drop]' else: tsfile=None if Ptype.rate in ptype: assert(os.path.isfile(ratefile)) inprop += '+[rating i.e. # of stars] ' else: ratefile = None print inprop opt = HoloScopeOpt(sm, qfun=qfun, b=b, tsfile=tsfile, tunit=tunit, ratefile=ratefile) opt.nbests=[] opt.nlocalbests=[] #mainly used for fastgreedy gsrows,gbscores,gbestvx = 0,0,0 for k in xrange(nblock): start_time = time.clock() if alg == 'greedy': n1, n2 = sm.shape if n1 + n2 > 1e4: print '[Warning] alg {} is slow for size {}x{}'\ .format(alg, n1, n2) A = np.ones(opt.nU,dtype=int) print 'initial start' opt.start(A, ptype=ptype) print 'greedy shaving algorithm ...' opt.greedyshaving() elif alg == 'fastgreedy': print """alg: {}\n\t+ # of singlular vectors: {}\n""".format(alg, numSing) print 'initial start' opt.initfastgreedy(ptype, numSing) print "::::Finish Init @ ", time.clock() - start_time print 'fast greedy algorithm ...' opt.fastgreedy() opt.nlocalbests.append(opt.fastlocalbest) else: print 'No such algorithm: '+alg sys.exit(1) print "::::Finish Algorithm @ ", time.clock() - start_time srows = opt.bestA.nonzero()[0] bscores = np.multiply(opt.bestfbs, opt.bestbsusps) opt.nbests.append((opt.bestvx, (srows, bscores))) gsrows, gbscores, gbestvx = (srows,bscores,opt.bestvx) \ if gbestvx < opt.bestvx else (gsrows, gbscores, gbestvx) if k < nblock-1: opt.removecurrentblock(srows) print 'global best size ', len(gsrows) print 'global best value ', gbestvx return (gbestvx, (gsrows, gbscores)), opt