"""A suite of cardinality estimators. In practicular, inference algorithms for autoregressive density estimators can be found in 'ProgressiveSampling'. """ import bisect import collections import json import operator import time import numpy as np import pandas as pd import torch import common import made import transformer OPS = { '>': np.greater, '<': np.less, '>=': np.greater_equal, '<=': np.less_equal, '=': np.equal } class CardEst(object): """Base class for a cardinality estimator.""" def __init__(self): self.query_starts = [] self.query_dur_ms = [] self.errs = [] self.est_cards = [] self.true_cards = [] self.name = 'CardEst' def Query(self, columns, operators, vals): """Estimates cardinality with the specified conditions. Args: columns: list of Column objects to filter on. operators: list of string representing what operation to perform on respective columns; e.g., ['<', '>=']. vals: list of raw values to filter columns on; e.g., [50, 100000]. These are not bin IDs. Returns: Predicted cardinality. """ raise NotImplementedError def OnStart(self): self.query_starts.append(time.time()) def OnEnd(self): self.query_dur_ms.append((time.time() - self.query_starts[-1]) * 1e3) def AddError(self, err): self.errs.append(err) def AddError(self, err, est_card, true_card): self.errs.append(err) self.est_cards.append(est_card) self.true_cards.append(true_card) def __str__(self): return self.name def get_stats(self): return [ self.query_starts, self.query_dur_ms, self.errs, self.est_cards, self.true_cards ] def merge_stats(self, state): self.query_starts.extend(state[0]) self.query_dur_ms.extend(state[1]) self.errs.extend(state[2]) self.est_cards.extend(state[3]) self.true_cards.extend(state[4]) def report(self): est = self print(est.name, "max", np.max(est.errs), "99th", np.quantile(est.errs, 0.99), "95th", np.quantile(est.errs, 0.95), "median", np.quantile(est.errs, 0.5), "time_ms", np.mean(est.query_dur_ms)) def QueryToPredicate(columns, operators, vals, wrap_as_string_cols=None): """Converts from (c,o,v) to sql string (for Postgres).""" v_s = [ str(v).replace('T', ' ') if type(v) is np.datetime64 else v for v in vals ] v_s = ["\'" + v + "\'" if type(v) is str else str(v) for v in v_s] if wrap_as_string_cols is not None: for i in range(len(columns)): if columns[i].name in wrap_as_string_cols: v_s[i] = "'" + str(v_s[i]) + "'" preds = [ c.pg_name + ' ' + o + ' ' + v for c, o, v in zip(columns, operators, v_s) ] s = ' and '.join(preds) return ' where ' + s def FillInUnqueriedColumns(table, columns, operators, vals): """Allows for some columns to be unqueried (i.e., wildcard). Returns cols, ops, vals, where all 3 lists of all size len(table.columns), in the table's natural column order. A None in ops/vals means that column slot is unqueried. """ ncols = len(table.columns) cs = table.columns os, vs = [None] * ncols, [None] * ncols for c, o, v in zip(columns, operators, vals): idx = table.ColumnIndex(c.name) os[idx] = o vs[idx] = v return cs, os, vs class ProgressiveSampling(CardEst): """Progressive sampling.""" def __init__( self, model, table, r, device=None, seed=False, cardinality=None, shortcircuit=False # Skip sampling on wildcards? ): super(ProgressiveSampling, self).__init__() torch.set_grad_enabled(False) self.model = model self.table = table self.shortcircuit = shortcircuit if r <= 1.0: self.r = r # Reduction ratio. 
self.num_samples = None else: self.num_samples = r self.seed = seed self.device = device self.cardinality = cardinality if cardinality is None: self.cardinality = table.cardinality with torch.no_grad(): self.init_logits = self.model( torch.zeros(1, self.model.nin, device=device)) self.dom_sizes = [c.DistributionSize() for c in self.table.columns] self.dom_sizes = np.cumsum(self.dom_sizes) # Inference optimizations below. self.traced_fwd = None # We can't seem to trace this because it depends on a scalar input. self.traced_encode_input = model.EncodeInput if 'MADE' in str(model): for layer in model.net: if type(layer) == made.MaskedLinear: if layer.masked_weight is None: layer.masked_weight = layer.mask * layer.weight print('Setting masked_weight in MADE, do not retrain!') for p in model.parameters(): p.detach_() p.requires_grad = False self.init_logits.detach_() with torch.no_grad(): self.kZeros = torch.zeros(self.num_samples, self.model.nin, device=self.device) self.inp = self.traced_encode_input(self.kZeros) # For transformer, need to flatten [num cols, d_model]. self.inp = self.inp.view(self.num_samples, -1) def __str__(self): if self.num_samples: n = self.num_samples else: n = int(self.r * self.table.columns[0].DistributionSize()) return 'psample_{}'.format(n) def _sample_n(self, num_samples, ordering, columns, operators, vals, inp=None): ncols = len(columns) logits = self.init_logits if inp is None: inp = self.inp[:num_samples] masked_probs = [] # Use the query to filter each column's domain. valid_i_list = [None] * ncols # None means all valid. for i in range(ncols): natural_idx = ordering[i] # Column i. op = operators[natural_idx] if op is not None: # There exists a filter. valid_i = OPS[op](columns[natural_idx].all_distinct_values, vals[natural_idx]).astype(np.float32, copy=False) else: continue # This line triggers a host -> gpu copy, showing up as a # hotspot in cprofile. valid_i_list[i] = torch.as_tensor(valid_i, device=self.device) # Fill in wildcards, if enabled. if self.shortcircuit: for i in range(ncols): natural_idx = i if ordering is None else ordering[i] if operators[natural_idx] is None and natural_idx != ncols - 1: if natural_idx == 0: self.model.EncodeInput( None, natural_col=0, out=inp[:, :self.model. input_bins_encoded_cumsum[0]]) else: l = self.model.input_bins_encoded_cumsum[natural_idx - 1] r = self.model.input_bins_encoded_cumsum[natural_idx] self.model.EncodeInput(None, natural_col=natural_idx, out=inp[:, l:r]) # Actual progressive sampling. Repeat: # Sample next var from curr logits -> fill in next var # Forward pass -> curr logits for i in range(ncols): natural_idx = i if ordering is None else ordering[i] # If wildcard enabled, 'logits' wasn't assigned last iter. if not self.shortcircuit or operators[natural_idx] is not None: probs_i = torch.softmax( self.model.logits_for_col(natural_idx, logits), 1) valid_i = valid_i_list[i] if valid_i is not None: probs_i *= valid_i probs_i_summed = probs_i.sum(1) masked_probs.append(probs_i_summed) # If some paths have vanished (~0 prob), assign some nonzero # mass to the whole row so that multinomial() doesn't complain. paths_vanished = (probs_i_summed <= 0).view(-1, 1) probs_i = probs_i.masked_fill_(paths_vanished, 1.0) if i < ncols - 1: # Num samples to draw for column i. 
if i != 0: num_i = 1 else: num_i = num_samples if num_samples else int( self.r * self.dom_sizes[natural_idx]) if self.shortcircuit and operators[natural_idx] is None: data_to_encode = None else: samples_i = torch.multinomial( probs_i, num_samples=num_i, replacement=True) # [bs, num_i] data_to_encode = samples_i.view(-1, 1) # Encode input: i.e., put sampled vars into input buffer. if data_to_encode is not None: # Wildcards are encoded already. if not isinstance(self.model, transformer.Transformer): if natural_idx == 0: self.model.EncodeInput( data_to_encode, natural_col=0, out=inp[:, :self.model. input_bins_encoded_cumsum[0]]) else: l = self.model.input_bins_encoded_cumsum[natural_idx - 1] r = self.model.input_bins_encoded_cumsum[ natural_idx] self.model.EncodeInput(data_to_encode, natural_col=natural_idx, out=inp[:, l:r]) else: # Transformer. Need special treatment due to # right-shift. l = (natural_idx + 1) * self.model.d_model r = l + self.model.d_model if i == 0: # Let's also add E_pos=0 to SOS (if enabled). # This is a no-op if disabled pos embs. self.model.EncodeInput( data_to_encode, # Will ignore. natural_col=-1, # Signals SOS. out=inp[:, :self.model.d_model]) if transformer.MASK_SCHEME == 1: # Should encode natural_col \in [0, ncols). self.model.EncodeInput(data_to_encode, natural_col=natural_idx, out=inp[:, l:r]) elif natural_idx < self.model.nin - 1: # If scheme is 0, should not encode the last # variable. self.model.EncodeInput(data_to_encode, natural_col=natural_idx, out=inp[:, l:r]) # Actual forward pass. next_natural_idx = i + 1 if ordering is None else ordering[i + 1] if self.shortcircuit and operators[next_natural_idx] is None: # If next variable in line is wildcard, then don't do # this forward pass. Var 'logits' won't be accessed. continue if hasattr(self.model, 'do_forward'): # With a specific ordering. logits = self.model.do_forward(inp, ordering) else: if self.traced_fwd is not None: logits = self.traced_fwd(inp) else: logits = self.model.forward_with_encoded_input(inp) # Doing this convoluted scheme because m_p[0] is a scalar, and # we want the correct shape to broadcast. p = masked_probs[1] for ls in masked_probs[2:]: p *= ls p *= masked_probs[0] return p.mean().item() def Query(self, columns, operators, vals): # Massages queries into natural order. columns, operators, vals = FillInUnqueriedColumns( self.table, columns, operators, vals) # TODO: we can move these attributes to ctor. ordering = None if hasattr(self.model, 'orderings'): ordering = self.model.orderings[0] orderings = self.model.orderings elif hasattr(self.model, 'm'): # MADE. ordering = self.model.m[-1] orderings = [self.model.m[-1]] else: print('****Warning: defaulting to natural order') ordering = np.arange(len(columns)) orderings = [ordering] num_orderings = len(orderings) # order idx (first/second/... to be sampled) -> x_{natural_idx}. inv_ordering = [None] * len(columns) for natural_idx in range(len(columns)): inv_ordering[ordering[natural_idx]] = natural_idx with torch.no_grad(): inp_buf = self.inp.zero_() # Fast (?) path. if num_orderings == 1: ordering = orderings[0] self.OnStart() p = self._sample_n( self.num_samples, ordering if isinstance( self.model, transformer.Transformer) else inv_ordering, columns, operators, vals, inp=inp_buf) self.OnEnd() return np.ceil(p * self.cardinality).astype(dtype=np.int32, copy=False) # Num orderings > 1.
ps = [] self.OnStart() for ordering in orderings: p_scalar = self._sample_n(self.num_samples // num_orderings, ordering, columns, operators, vals) ps.append(p_scalar) self.OnEnd() return np.ceil(np.mean(ps) * self.cardinality).astype( dtype=np.int32, copy=False) class SampleFromModel(CardEst): """Sample from an autoregressive model.""" def __init__(self, model, table, num_samples_per_query, device=None): super(SampleFromModel, self).__init__() self.model = model self.table = table # The table that MADE is trained on. self.num_samples_per_query = num_samples_per_query self.device = device #device to use for pytorch doms = [c.DistributionSize() for c in table.columns] # Right shift by 1; put 0 at head. doms[1:] = doms[:-1] doms[0] = 0 self.cumsum_shifted_doms = np.cumsum(doms) print('shifted cumsum', self.cumsum_shifted_doms) def __str__(self): return 'msample_{}'.format(self.num_samples_per_query) def SampleTuples(self, num): """Samples num tuples from the MADE model""" samples = self.model.sample(num, self.device).to(torch.int32).cpu().numpy() return samples def Query(self, columns, operators, vals): columns, operators, vals = FillInUnqueriedColumns( self.table, columns, operators, vals) self.OnStart() # [N, num cols]. tuples = self.SampleTuples(self.num_samples_per_query) # all_valids: # [ (col1) T, F, F, T; (col2) F, F, T; (col3) T ] # # Samples: # [ [ 0, 2, 0 ]; [1, 1, 0] ] # # Then only the first sample satisfies the query. all_valids = [] for col, op, val in zip(columns, operators, vals): if op is not None: valid = OPS[op](col.all_distinct_values, val) else: valid = [True] * col.DistributionSize() all_valids.extend(valid) all_valids = np.asarray(all_valids) # all() along column dimension: indicates whether each sample passes. s = all_valids.take(tuples + self.cumsum_shifted_doms).all(1).sum() sel = s * 1.0 / self.num_samples_per_query self.OnEnd() return np.ceil(sel * self.table.cardinality).astype(dtype=np.int32) class Heuristic(CardEst): """Uses independence assumption.""" def __init__(self, table): super(Heuristic, self).__init__() self.table = table self.size = self.table.cardinality def __str__(self): return 'heuristic' def Query(self, columns, operators, vals): self.OnStart() sels = [ OPS[o](c.data if isinstance(c.data, np.ndarray) else c.data.values, v).sum() / self.size for c, o, v in zip(columns, operators, vals) ] sel = np.prod(sels) self.OnEnd() return np.ceil(sel * self.size).astype(np.int32) class Oracle(CardEst): """Returns true cardinalities.""" def __init__(self, table, limit_first_n=None): super(Oracle, self).__init__() self.table = table self.limit_first_n = limit_first_n def __str__(self): return 'oracle' def Query(self, columns, operators, vals, return_masks=False): assert len(columns) == len(operators) == len(vals) self.OnStart() bools = None for c, o, v in zip(columns, operators, vals): if self.limit_first_n is None: inds = OPS[o](c.data, v) else: # For data shifts experiment. 
inds = OPS[o](c.data[:self.limit_first_n], v) if bools is None: bools = inds else: bools &= inds c = bools.sum() self.OnEnd() if return_masks: return bools return c class QueryRegionSize(CardEst): """Returns query region size including wildcards.""" def __init__(self, table, count_wildcards=True): super().__init__() self.table = table self.count_wildcards = count_wildcards def __str__(self): return 'region_size_{}'.format(self.count_wildcards) def Query(self, columns, operators, vals, return_masks=False): columns, operators, vals = FillInUnqueriedColumns( self.table, columns, operators, vals) total_size = 1.0 for c, o, v in zip(columns, operators, vals): if o is None: if self.count_wildcards: domain_i_size = len(c.all_distinct_values) else: domain_i_size = 1.0 else: domain_i_size = OPS[o](c.all_distinct_values, v).sum() total_size *= domain_i_size return total_size class Const(CardEst): """Returns a constant.""" def __init__(self, const): super().__init__() self.const = const def __str__(self): return 'Const[{}]'.format(self.const) def Query(self, columns, operators, vals): self.OnStart() c = self.const self.OnEnd() return c class Sampling(CardEst): """Keep p% of samples in memory.""" def __init__(self, table, p): super(Sampling, self).__init__() self.table = table self.p = p self.num_samples = int(p * table.cardinality) self.size = table.cardinality # TODO: add seed for repro. self.tuples = table.data.sample(n=self.num_samples) self.name = str(self) def __str__(self): if self.p * 100 != int(self.p * 100): return 'sample_{:.1f}%'.format(self.p * 100) return 'sample_{}%'.format(int(self.p * 100)) def Query(self, columns, operators, vals): assert len(columns) == len(operators) == len(vals) self.OnStart() qualifying_tuples = [] for col, op, val in zip(columns, operators, vals): qualifying_tuples.append(OPS[op](self.tuples[col.name], val)) s = np.all(qualifying_tuples, axis=0).sum() sel = s * 1.0 / self.num_samples self.OnEnd() return np.ceil(sel * self.table.cardinality).astype(dtype=np.int32) class Postgres(CardEst): def __init__(self, database, relation, port=None): """Postgres estimator (i.e., EXPLAIN). Must have the PG server live. E.g., def MakeEstimators(): return [Postgres('dmv', 'vehicle_reg', None), ...] Args: database: string, the database name. relation: string, the relation name. port: int, the port. """ import psycopg2 super(Postgres, self).__init__() self.conn = psycopg2.connect(database=database, port=port) self.conn.autocommit = True self.cursor = self.conn.cursor() self.cursor.execute('analyze ' + relation + ';') self.conn.commit() self.database = database self.relation = relation def __str__(self): return 'postgres' def Query(self, columns, operators, vals): assert len(columns) == len(operators) == len(vals) pred = QueryToPredicate(columns, operators, vals) # Use json so it's easier to parse. query_s = 'explain(format json) select * from ' + self.relation + pred # print(query_s) self.OnStart() self.cursor.execute(query_s) res = self.cursor.fetchall() # print(res) result = res[0][0][0]['Plan']['Plan Rows'] self.OnEnd() return result def QueryByExec(self, columns, operators, vals): # Runs actual query on postgres and returns true cardinality. 
assert len(columns) == len(operators) == len(vals) pred = QueryToPredicate(columns, operators, vals) query_s = 'select count(*) from ' + self.relation + pred self.string = query_s self.cursor.execute(query_s) result = self.cursor.fetchone()[0] return result def Close(self): self.cursor.close() self.conn.close() class BayesianNetwork(CardEst): """Progressive sampling with a pomegranate bayes net.""" def build_discrete_mapping(self, table, discretize, discretize_method): assert discretize_method in ["equal_size", "equal_freq"], discretize_method self.max_val = collections.defaultdict(lambda: None) if not discretize: return {} table = table.copy() mapping = {} for col_id in range(len(table[0])): col = table[:, col_id] if max(col) > discretize: if discretize_method == "equal_size": denom = (max(col) + 1) / discretize fn = lambda v: np.floor(v / denom) elif discretize_method == "equal_freq": per_bin = len(col) // discretize counts = collections.defaultdict(int) for x in col: counts[int(x)] += 1 assignments = {} i = 0 bin_size = 0 for k, count in sorted(counts.items()): if bin_size > 0 and bin_size + count >= per_bin: bin_size = 0 i += 1 assignments[k] = i self.max_val[col_id] = i bin_size += count assignments = np.array( [assignments[i] for i in range(int(max(col) + 1))]) def capture(assignments): def fn(v): return assignments[v.astype(np.int32)] return fn fn = capture(assignments) else: assert False mapping[col_id] = fn return mapping def apply_discrete_mapping(self, table, discrete_mapping): table = table.copy() for col_id in range(len(table[0])): if col_id in discrete_mapping: fn = discrete_mapping[col_id] table[:, col_id] = fn(table[:, col_id]) return table def apply_discrete_mapping_to_value(self, value, col_id, discrete_mapping): if col_id not in discrete_mapping: return value return discrete_mapping[col_id](value) def __init__(self, dataset, num_samples, algorithm="greedy", max_parents=-1, topological_sampling_order=True, use_pgm=True, discretize=None, discretize_method="equal_size", root=None): CardEst.__init__(self) from pomegranate import BayesianNetwork self.discretize = discretize self.discretize_method = discretize_method self.dataset = dataset self.original_table = self.dataset.tuples.numpy() self.algorithm = algorithm self.topological_sampling_order = topological_sampling_order self.num_samples = num_samples self.discrete_mapping = self.build_discrete_mapping( self.original_table, discretize, discretize_method) self.discrete_table = self.apply_discrete_mapping( self.original_table, self.discrete_mapping) print('calling BayesianNetwork.from_samples...', end='') t = time.time() self.model = BayesianNetwork.from_samples(self.discrete_table, algorithm=self.algorithm, max_parents=max_parents, n_jobs=8, root=root) print('done, took', time.time() - t, 'secs.') def size(states): n = 0 for state in states: if "distribution" in state: dist = state["distribution"] else: dist = state if dist["name"] == "DiscreteDistribution": for p in dist["parameters"]: n += len(p) elif dist["name"] == "ConditionalProbabilityTable": for t in dist["table"]: n += len(t) if "parents" in dist: for parent in dist["parents"]: n += size(dist["parents"]) else: assert False, dist["name"] return n self.size = 4 * size(json.loads(self.model.to_json())["states"]) # print('json:\n', self.model.to_json()) self.json_size = len(self.model.to_json()) self.use_pgm = use_pgm # print(self.model.to_json()) if topological_sampling_order: self.sampling_order = [] while len(self.sampling_order) < len(self.model.structure): for i, 
deps in enumerate(self.model.structure): if i in self.sampling_order: continue # already ordered if all(d in self.sampling_order for d in deps): self.sampling_order.append(i) print("Building sampling order", self.sampling_order) else: self.sampling_order = list(range(len(self.model.structure))) print("Using sampling order", self.sampling_order, str(self)) if use_pgm: from pgmpy.models import BayesianModel data = pd.DataFrame(self.discrete_table.astype(np.int64)) spec = [] orphans = [] for i, parents in enumerate(self.model.structure): for p in parents: spec.append((p, i)) if not parents: orphans.append(i) print("Model spec", spec) model = BayesianModel(spec) for o in orphans: model.add_node(o) print('calling pgm.BayesianModel.fit...', end='') t = time.time() model.fit(data) print('done, took', time.time() - t, 'secs.') self.pgm_model = model def __str__(self): return "bn-{}-{}-{}-{}-bytes-{}-{}-{}".format( self.algorithm, self.num_samples, "topo" if self.topological_sampling_order else "nat", self.size, # self.json_size, self.discretize, self.discretize_method if self.discretize else "na", "pgmpy" if self.use_pgm else "pomegranate") def Query(self, columns, operators, vals): if len(columns) != len(self.dataset.table.columns): columns, operators, vals = FillInUnqueriedColumns( self.dataset.table, columns, operators, vals) self.OnStart() ncols = len(columns) nrows = self.discrete_table.shape[0] assert ncols == self.discrete_table.shape[1], ( ncols, self.discrete_table.shape) def draw_conditional_pgm(evidence, col_id): """PGM version of draw_conditional()""" if operators[col_id] is None: op = None val = None else: op = OPS[operators[col_id]] val = self.apply_discrete_mapping_to_value( self.dataset.table.val_to_bin_funcs[col_id](vals[col_id]), col_id, self.discrete_mapping) if self.discretize: # avoid some bad cases if val == 0 and operators[col_id] == "<": val += 1 elif val == self.max_val[col_id] and operators[ col_id] == ">": val -= 1 def prob_match(distribution): if not op: return 1. p = 0. for k, v in enumerate(distribution): if op(k, val): p += v return p from pgmpy.inference import VariableElimination model_inference = VariableElimination(self.pgm_model) xi_distribution = [] for row in evidence: e = {} for i, v in enumerate(row): if v is not None: e[i] = v result = model_inference.query(variables=[col_id], evidence=e) xi_distribution.append(result[col_id].values) xi_marginal = [prob_match(d) for d in xi_distribution] filtered_distributions = [] for d in xi_distribution: keys = [] prob = [] for k, p in enumerate(d): if not op or op(k, val): keys.append(k) prob.append(p) denominator = sum(prob) if denominator == 0: prob = [1. for _ in prob] # doesn't matter if len(prob) == 0: prob = [1.] keys = [0.] prob = np.array(prob) / sum(prob) filtered_distributions.append((keys, prob)) xi_samples = [ np.random.choice(k, p=v) for k, v in filtered_distributions ] return xi_marginal, xi_samples def draw_conditional(evidence, col_id): """Draws a new value x_i for the column, and returns P(x_i|prev). Arguments: evidence: shape [BATCH, ncols] with None for unknown cols col_id: index of the current column, i Returns: xi_marginal: P(x_i|x0...x_{i-1}), computed by marginalizing across the range constraint match_rows: the subset of rows from filtered_rows that also satisfy the predicate at column i. 
""" if operators[col_id] is None: op = None val = None else: op = OPS[operators[col_id]] val = self.apply_discrete_mapping_to_value( self.dataset.table.val_to_bin_funcs[col_id](vals[col_id]), col_id, self.discrete_mapping) if self.discretize: # avoid some bad cases if val == 0 and operators[col_id] == "<": val += 1 elif val == self.max_val[col_id] and operators[ col_id] == ">": val -= 1 def prob_match(distribution): if not op: return 1. p = 0. for k, v in distribution.items(): if op(k, val): p += v return p xi_distribution = self.model.predict_proba(evidence, max_iterations=1, n_jobs=-1) xi_marginal = [ prob_match(d[col_id].parameters[0]) for d in xi_distribution ] filtered_distributions = [] for d in xi_distribution: keys = [] prob = [] for k, p in d[col_id].parameters[0].items(): if not op or op(k, val): keys.append(k) prob.append(p) denominator = sum(prob) if denominator == 0: prob = [1. for _ in prob] # doesn't matter if len(prob) == 0: prob = [1.] keys = [0.] prob = np.array(prob) / sum(prob) filtered_distributions.append((keys, prob)) xi_samples = [ np.random.choice(k, p=v) for k, v in filtered_distributions ] return xi_marginal, xi_samples p_estimates = [1. for _ in range(self.num_samples)] evidence = [[None] * ncols for _ in range(self.num_samples)] for col_id in self.sampling_order: if self.use_pgm: xi_marginal, xi_samples = draw_conditional_pgm(evidence, col_id) else: xi_marginal, xi_samples = draw_conditional(evidence, col_id) for ev_list, xi in zip(evidence, xi_samples): ev_list[col_id] = xi for i in range(self.num_samples): p_estimates[i] *= xi_marginal[i] self.OnEnd() return int(np.mean(p_estimates) * nrows) class MaxDiffHistogram(CardEst): """MaxDiff n-dimensional histogram.""" def __init__(self, table, limit): super(MaxDiffHistogram, self).__init__() self.table = table self.limit = limit self.partitions = [] self.maxdiff = {} self.partition_to_maxdiff = {} self.num_new_partitions = 2 # map<cid, map<bound_type, map<bound_value, list(partition id)>>> self.column_bound_map = {} for cid in range(len(self.table.columns)): self.column_bound_map[cid] = {} self.column_bound_map[cid]['l'] = {} self.column_bound_map[cid]['u'] = {} # map<cid, map<bound_type, sorted_list(bound_value)>> self.column_bound_index = {} for cid in range(len(self.table.columns)): self.column_bound_index[cid] = {} self.column_bound_index[cid]['l'] = [] self.column_bound_index[cid]['u'] = [] self.name = str(self) print('Building MaxDiff histogram, may take a while...') self._build_histogram() def __str__(self): return 'maxdiff[{}]'.format(self.limit) class Partition(object): def __init__(self): # a list of tuples (low, high) self.boundaries = [] # a list of row id that belongs to this partition self.data_points = [] # per-column uniform spread self.uniform_spreads = [] # per-distinct value density self.density = None self.col_value_list = {} self.rowid_to_position = {} def Size(self): total_size = 0 for _ in self.uniform_spreads: total_size += (4 * len(_)) total_size += 4 return total_size def _compute_maxdiff(self, partition): for col in range(len(partition.boundaries)): vals = partition.col_value_list[col] counter_start = time.time() counter = pd.Series(vals).value_counts().sort_index() # compute Diff(V, A) spread = counter.index[1:] - counter.index[:-1] spread_m_counts = spread * counter.iloc[:-1] maxdiff = 0 if len(spread_m_counts) > 0: maxdiff = max(spread_m_counts.values.max(), 0) if maxdiff not in self.maxdiff: self.maxdiff[maxdiff] = [(partition, col)] else: self.maxdiff[maxdiff].append((partition, 
col)) self.partition_to_maxdiff[partition].add(maxdiff) def _build_histogram(self): # initial partition p = self.Partition() # populate initial boundary for cid in range(len(self.table.columns)): if not self.table.columns[cid].data.dtype == 'int64': p.boundaries.append( (0, self.table.columns[cid].distribution_size, True)) else: p.boundaries.append((min(self.table.columns[cid].data), max(self.table.columns[cid].data), True)) # include all rowids self.table_ds = common.TableDataset(self.table) num_rows = self.table.cardinality p.data_points = np.arange(num_rows) for cid in range(len(p.boundaries)): if not self.table.columns[cid].data.dtype == 'int64': p.col_value_list[cid] = self.table_ds.tuples_np[:, cid] else: p.col_value_list[cid] = self.table.columns[cid].data[:, cid] p.rowid_to_position = list(np.arange(num_rows)) self.partition_to_maxdiff[p] = set() self._compute_maxdiff(p) self.partitions.append(p) while len(self.partitions) < self.limit: start_next_partition = time.time() (split_partition_index, split_column_index, partition_boundaries, global_maxdiff) = self.next_partition_candidate( self.partitions, len(self.table.columns), self.table, min(self.num_new_partitions, self.limit - len(self.partitions) + 1), self.maxdiff) print('determining partition number ', len(self.partitions)) if global_maxdiff == 0: print('maxdiff already 0 before reaching bucket limit') break start_generate_next_partition = time.time() new_partitions = self.generate_new_partitions( self.partitions[split_partition_index], split_column_index, partition_boundaries) for p in new_partitions: self.partition_to_maxdiff[p] = set() self._compute_maxdiff(p) for d in self.partition_to_maxdiff[ self.partitions[split_partition_index]]: remove_set = set() for cid in range(len(self.table.columns)): remove_set.add( (self.partitions[split_partition_index], cid)) for tp in remove_set: if tp in self.maxdiff[d]: self.maxdiff[d].remove(tp) if len(self.maxdiff[d]) == 0: del self.maxdiff[d] del self.partition_to_maxdiff[ self.partitions[split_partition_index]] self.partitions.pop(split_partition_index) self.partitions += new_partitions # finish partitioning, for each partition we condense its info and # compute uniform spread. total_point = 0 for pid, partition in enumerate(self.partitions): total_point += len(partition.data_points) total = len(partition.data_points) total_distinct = 1 for cid, boundary in enumerate(partition.boundaries): distinct = len( set(self.table.columns[cid].data[rowid] for rowid in partition.data_points)) if distinct == 1: if not self.table.columns[cid].data.dtype == 'int64': partition.uniform_spreads.append([ list( set(self.table_ds. 
tuples_np[partition.data_points, cid]))[0] ]) else: partition.uniform_spreads.append([ list( set(self.table.columns[cid].data[ partition.data_points]))[0] ]) else: uniform_spread = None spread_length = None if boundary[2]: spread_length = float( (boundary[1] - boundary[0])) / (distinct - 1) uniform_spread = [boundary[0]] else: spread_length = float( (boundary[1] - boundary[0])) / (distinct) uniform_spread = [boundary[0] + spread_length] for _ in range(distinct - 2): uniform_spread.append(uniform_spread[-1] + spread_length) uniform_spread.append(boundary[1]) partition.uniform_spreads.append(uniform_spread) total_distinct = total_distinct * distinct partition.density = float(total) / total_distinct print('total number of point is ', total_point) # populate column bound map and list metadata for cid in range(len(self.table.columns)): for pid, partition in enumerate(self.partitions): if partition.boundaries[cid][0] not in self.column_bound_map[ cid]['l']: self.column_bound_map[cid]['l'][partition.boundaries[cid] [0]] = [pid] else: self.column_bound_map[cid]['l'][partition.boundaries[cid] [0]].append(pid) if partition.boundaries[cid][1] not in self.column_bound_map[ cid]['u']: self.column_bound_map[cid]['u'][partition.boundaries[cid] [1]] = [pid] else: self.column_bound_map[cid]['u'][partition.boundaries[cid] [1]].append(pid) self.column_bound_index[cid]['l'].append( partition.boundaries[cid][0]) self.column_bound_index[cid]['u'].append( partition.boundaries[cid][1]) self.column_bound_index[cid]['l'] = sorted( set(self.column_bound_index[cid]['l'])) self.column_bound_index[cid]['u'] = sorted( set(self.column_bound_index[cid]['u'])) def next_partition_candidate(self, partitions, column_number, table, num_new_partitions, maxdiff_map): global_maxdiff = max(sorted(maxdiff_map.keys())) partition, cid = maxdiff_map[global_maxdiff][0] vals = partition.col_value_list[cid] counter = collections.Counter(vals) first_key = True prev_key = None diff = [] for key in sorted(counter.keys()): if first_key: first_key = False prev_key = key else: spread = key - prev_key diff.append((prev_key, (spread * counter[prev_key]))) prev_key = key diff.append((prev_key, (0 * counter[prev_key]))) partition_boundaries = sorted( list( tp[0] for tp in sorted(diff, key=operator.itemgetter( 1), reverse=True)[:min(num_new_partitions - 1, len(diff))])) return (partitions.index(partition), cid, partition_boundaries, global_maxdiff) def generate_new_partitions(self, partition, partition_column_index, partition_boundaries): new_partitions = [] for i in range(len(partition_boundaries) + 1): new_partition = self.Partition() for cid, boundary in enumerate(partition.boundaries): if not cid == partition_column_index: new_partition.boundaries.append(boundary) else: if i == 0: new_partition.boundaries.append( (boundary[0], partition_boundaries[i], boundary[2])) elif i == len(partition_boundaries): new_partition.boundaries.append( (partition_boundaries[i - 1], boundary[1], False)) else: new_partition.boundaries.append( (partition_boundaries[i - 1], partition_boundaries[i], False)) new_partitions.append(new_partition) # distribute data points to new partitions for rowid in partition.data_points: if not self.table.columns[ partition_column_index].data.dtype == 'int64': val = self.table_ds.tuples_np[rowid, partition_column_index] else: val = self.table.columns[partition_column_index].data[rowid] # find the new partition that the row belongs to new_partitions[bisect.bisect_left(partition_boundaries, val)].data_points.append(rowid) # debug start 
= time.time() for new_partition in new_partitions: for cid in range(len(new_partition.boundaries)): new_partition.col_value_list[cid] = [ partition.col_value_list[cid][ partition.rowid_to_position[rowid]] for rowid in new_partition.data_points ] pos = 0 for rowid in new_partition.data_points: new_partition.rowid_to_position[rowid] = pos pos += 1 if len(new_partition.data_points) == 0: print('found partition with no data!') print(sorted( list(self.table.columns[partition_column_index].data[rowid] for rowid in partition.data_points))) print(partition_boundaries) return new_partitions def _populate_column_set_map(self, c, o, v, column_set_map): cid = self.table.ColumnIndex(c.name) column_set_map[cid] = set() if o in ['<', '<=']: insert_index = None if o == '<': insert_index = bisect.bisect_left( self.column_bound_index[cid]['l'], v) for i in range(insert_index): column_set_map[cid] = column_set_map[cid].union( self.column_bound_map[cid]['l'][ self.column_bound_index[cid]['l'][i]]) else: insert_index = bisect.bisect(self.column_bound_index[cid]['l'], v) for i in range(insert_index): if self.column_bound_index[cid]['l'][i] == v: for pid in self.column_bound_map[cid]['l'][v]: if self.partitions[pid].boundaries[cid][2]: # add only when the lower bound is inclusive column_set_map[cid].add(pid) else: column_set_map[cid] = column_set_map[cid].union( self.column_bound_map[cid]['l'][ self.column_bound_index[cid]['l'][i]]) elif o in ['>', '>=']: insert_index = None if o == '>': insert_index = bisect.bisect(self.column_bound_index[cid]['u'], v) else: insert_index = bisect.bisect_left( self.column_bound_index[cid]['u'], v) for i in range(insert_index, len(self.column_bound_index[cid]['u'])): column_set_map[cid] = column_set_map[cid].union( self.column_bound_map[cid]['u'][self.column_bound_index[cid] ['u'][i]]) else: assert o == '=', o lower_bound_set = set() insert_index = bisect.bisect(self.column_bound_index[cid]['l'], v) for i in range(insert_index): if self.column_bound_index[cid]['l'][i] == v: for pid in self.column_bound_map[cid]['l'][v]: if self.partitions[pid].boundaries[cid][2]: # add only when the lower bound is inclusive lower_bound_set.add(pid) else: lower_bound_set = lower_bound_set.union( self.column_bound_map[cid]['l'][ self.column_bound_index[cid]['l'][i]]) upper_bound_set = set() insert_index = bisect.bisect_left(self.column_bound_index[cid]['u'], v) for i in range(insert_index, len(self.column_bound_index[cid]['u'])): upper_bound_set = upper_bound_set.union( self.column_bound_map[cid]['u'][self.column_bound_index[cid] ['u'][i]]) column_set_map[cid] = lower_bound_set.intersection(upper_bound_set) def _estimate_cardinality_per_partition(self, partition, columns, operators, vals): distinct_val_covered = 1 observed_cid = [] for c, o, v in zip(columns, operators, vals): if not c.data.dtype == 'int64': v = c.ValToBin(v) cid = self.table.ColumnIndex(c.name) observed_cid.append(cid) spread = partition.uniform_spreads[cid] if o in ['<', '<=']: if o == '<': distinct_val_covered = distinct_val_covered * bisect.bisect_left( spread, v) else: distinct_val_covered = distinct_val_covered * bisect.bisect( spread, v) elif o in ['>', '>=']: if o == '>': distinct_val_covered = distinct_val_covered * ( len(spread) - bisect.bisect(spread, v)) else: distinct_val_covered = distinct_val_covered * ( len(spread) - bisect.bisect_left(spread, v)) else: assert o == '=', o if not v in spread: distinct_val_covered = 0 for cid in range(len(partition.uniform_spreads)): if not cid in observed_cid: distinct_val_covered = 
distinct_val_covered * len( partition.uniform_spreads[cid]) return distinct_val_covered * partition.density def Query(self, columns, operators, vals): self.OnStart() # map<cid, set(pids)> column_set_map = {} for c, o, v in zip(columns, operators, vals): if not c.data.dtype == 'int64': v = c.ValToBin(v) self._populate_column_set_map(c, o, v, column_set_map) # compute the set of pids that's relevant to this query relevant_pids = set() first = True for cid in column_set_map: if first: relevant_pids = column_set_map[cid] first = False else: relevant_pids = relevant_pids.intersection(column_set_map[cid]) total_card = 0 # for each pid, check full or partial coverage # if full coverage, just add the full count # otherwise, estimate the partial sum with uniform spread assumption for pid in relevant_pids: total_card += self._estimate_cardinality_per_partition( self.partitions[pid], columns, operators, vals) self.OnEnd() return total_card def Size(self): total_size = 15 * 2 * 4 for p in self.partitions: total_size += p.Size() total_size += 24 * (len(self.partitions) - 1) return total_size
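

# ----------------------------------------------------------------------------
# Example usage (a minimal sketch, not part of the estimator classes above).
# It assumes a workload of (columns, operators, vals) queries built elsewhere
# (see the `common` module for table/column construction). The q-error
# convention below (max(est/true, true/est), with both sides floored at 1) is
# the usual metric fed into AddError()/report(); the driver itself, and the
# names QError/RunWorkload, are illustrative, not part of the original API.


def QError(est_card, true_card):
    """Symmetric multiplicative error; equals 1.0 for a perfect estimate."""
    est = max(float(est_card), 1.0)
    true = max(float(true_card), 1.0)
    return max(est / true, true / est)


def RunWorkload(estimators, oracle, queries):
    """Runs every estimator over `queries` and reports error statistics.

    Args:
      estimators: list of CardEst instances (e.g., Heuristic, Sampling,
        ProgressiveSampling).
      oracle: an Oracle instance used to obtain true cardinalities.
      queries: iterable of (columns, operators, vals) tuples, in the format
        accepted by CardEst.Query().
    """
    for cols, ops, vals in queries:
        true_card = oracle.Query(cols, ops, vals)
        for est in estimators:
            est_card = est.Query(cols, ops, vals)
            est.AddError(QError(est_card, true_card), est_card, true_card)
    # Prints max/99th/95th/median q-error and mean latency per estimator.
    for est in estimators:
        est.report()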
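

# A second, self-contained sketch (numpy-only; ToyProgressiveSampling is a
# hypothetical helper, not part of this module). It mirrors the quantity
# ProgressiveSampling._sample_n estimates for a two-column predicate,
#   P(x0 in R0) * E_{x0 ~ P(x0 | x0 in R0)}[ P(x1 in R1 | x0) ],
# but computes the conditionals exactly from a small integer-coded array
# instead of querying a learned autoregressive model. Sampling the filtered
# prefix is what lets the estimator capture column correlations that the
# independence-assumption Heuristic above misses.


def ToyProgressiveSampling(col0, col1, valid0, valid1, num_samples=500,
                           seed=0):
    """Estimates the selectivity of (col0 in R0) AND (col1 in R1).

    Args:
      col0, col1: 1-D integer-coded numpy arrays of equal length (the table).
      valid0, valid1: {0, 1} float masks over each column's value domain,
        playing the role of valid_i in _sample_n (1.0 = value passes filter).
    Returns:
      Estimated selectivity in [0, 1].
    """
    rng = np.random.default_rng(seed)
    # Exact marginal P(x0), masked by the column-0 filter.
    p0 = np.bincount(col0, minlength=len(valid0)).astype(float)
    p0 /= p0.sum()
    masked0 = p0 * valid0
    prefix_mass = masked0.sum()  # P(x0 passes the filter).
    if prefix_mass == 0:
        return 0.0
    # Sample x0 from P(x0 | x0 passes), as multinomial() does in _sample_n.
    samples0 = rng.choice(len(valid0), size=num_samples,
                          p=masked0 / prefix_mass)
    # For each sampled x0, compute the exact masked mass of column 1.
    cond_mass = np.empty(num_samples)
    for i, v in enumerate(samples0):
        rows = col1[col0 == v]
        p1 = np.bincount(rows, minlength=len(valid1)).astype(float)
        p1 /= p1.sum()
        cond_mass[i] = (p1 * valid1).sum()
    return prefix_mass * cond_mass.mean()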
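

# A third sketch, illustrating the index arithmetic SampleFromModel.Query
# uses to test sampled tuples against per-column validity masks: the masks
# are concatenated into one flat array, and each sampled bin ID is offset by
# the right-shifted cumulative domain sizes so a single take() + all() checks
# every column at once. The toy values below are illustrative only.


def _demo_flat_validity_check():
    doms = [3, 2]  # Domain sizes of two columns.
    shifted = np.cumsum([0] + doms[:-1])  # [0, 3]: per-column offsets.
    # col0 passes for bins {0, 2}; col1 passes for bin {1}.
    all_valids = np.asarray([True, False, True, False, True])
    tuples = np.asarray([
        [0, 1],  # Passes both filters.
        [2, 0],  # Fails the col1 filter.
        [1, 1],  # Fails the col0 filter.
    ])
    passes = all_valids.take(tuples + shifted).all(1)
    return passes.sum()  # -> 1 qualifying sample.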