''' read the input data, parse to int list; create mappings of user -> reviews, item -> reviews The companion iterator reads thru the input file sequentially, yielding the data of the form: user word id list, item word id list, rating (float) @author: roseck @date Feb 28, 2017 ''' from __builtin__ import dict import gzip from DatasetUtils.Review import Review class DataMgr(): def _int_list(self,int_str): '''utility fn for converting an int string to a list of int ''' return [int(w) for w in int_str.split()] def __init__(self, filename, empty_user = set()): ''' filename: inits the UBRR data from the input file empty_user: skip the reviews by this user (keeps the ratings) ''' self.empty_user = empty_user ur_map = dict() br_map = dict() cnt = 0 skipped = 0 #read the file if filename.endswith('.gz'): f = gzip.open(filename, 'r') else: f = open(filename, 'r') for line in f: vals = line.split("\t") if len(vals) == 0: continue u = vals[0] b = vals[1] r = float(vals[2]) d = vals[3].strip() if u in self.empty_user: #we are skipping this review d = '' skipped += 1 rev = Review(u, b, r, d) #review obj #store biz -> list of reviews if not br_map.has_key(b): br_map[b] = [] br_map[b].append(rev) #store user -> list of reviews if not ur_map.has_key(u): ur_map[u] = [] ur_map[u].append(rev) cnt += 1 self.biz_map = br_map self.user_map = ur_map f.close() print 'Review Data Manager Initialized with ', cnt, ' reviews' print 'Number of skipped users = ', len(self.empty_user) print 'Number of skipped reviews = ', skipped