/*
 * Copyright (C) 2017-2019 Dremio Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.dremio.service.reflection.analysis;

import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

import org.apache.calcite.linq4j.Ord;
import org.apache.calcite.sql.type.SqlTypeName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.dremio.common.utils.SqlUtils;
import com.dremio.exec.util.ViewFieldsHelper;
import com.dremio.service.accelerator.AccelerationUtils;
import com.dremio.service.namespace.dataset.proto.DatasetConfig;
import com.dremio.service.namespace.dataset.proto.ViewFieldType;
import com.dremio.service.reflection.ReflectionValidator;
import com.dremio.service.reflection.analysis.ReflectionAnalyzer.ColumnStats;
import com.dremio.service.reflection.analysis.ReflectionAnalyzer.RField;
import com.dremio.service.reflection.analysis.ReflectionAnalyzer.TableStats;
import com.dremio.service.reflection.proto.DimensionGranularity;
import com.dremio.service.reflection.proto.ReflectionDetails;
import com.dremio.service.reflection.proto.ReflectionDimensionField;
import com.dremio.service.reflection.proto.ReflectionField;
import com.dremio.service.reflection.proto.ReflectionGoal;
import com.dremio.service.reflection.proto.ReflectionMeasureField;
import com.dremio.service.reflection.proto.ReflectionType;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

/**
 * Suggests raw and aggregation reflections for a dataset.
 */
public class ReflectionSuggester {
  // SLF4J logger for this class.
  private static final Logger logger = LoggerFactory.getLogger(ReflectionSuggester.class);

  // Orders ColumnStats by comparing two long values with Long.compare.
  // NOTE(review): the arguments passed to Long.compare and the closing of this anonymous class are
  // not visible in this view of the file — confirm which statistic drives the ranking.
  private static final Comparator<ColumnStats> COLUMN_RANKING = new Comparator<ColumnStats>() {
    public int compare(final ColumnStats left, final ColumnStats right) {
      return Long.compare(

  private static final Function<ColumnStats, ReflectionField> TO_REFLECTION_FIELD = new Function<ColumnStats, ReflectionField>() {
    public ReflectionField apply(final ColumnStats columnStats) {
      return new ReflectionField(columnStats.getField().getName());

  private static final Predicate<RField> MEASURE_TYPE_FILTER = new Predicate<RField>() {
    public boolean apply(@Nullable final RField input) {
      return TypeUtils.isNumeric(input);

  private static final Predicate<ColumnStats> DIMENSION_TYPE_FILTER = new Predicate<ColumnStats>() {
    public boolean apply(@Nullable final ColumnStats columnStats) {
      return columnStats.getMaxLength() <= MAX_DIMENSION_FIELD_LENGTH && !TypeUtils.isComplex(columnStats.getField());

  // ratio of dimension fields to all fields
  // NOTE(review): not referenced by any code visible in this file — confirm it is still needed
  private static final double DIMENSION_FIELDS_RATIO = 1;
  // maximum number of measure fields to discover; upper bound of measureLimit in getAggReflections
  private static final int MAX_MEASURE_FIELDS = 30;
  // ratio of measure fields to all fields; scales the column count when computing measureLimit
  private static final double MEASURE_FIELDS_RATIO = .9;
  // max field length (ColumnStats.getMaxLength) for a column to pass DIMENSION_TYPE_FILTER
  private static final int MAX_DIMENSION_FIELD_LENGTH = 50;
  // Upper limit for the cartesian product of dimension cardinalities:
  // 2L << 30 == 2^31 (about 2.1 billion), roughly 1 TB assuming 500 bytes per row
  private static final long CARTESIAN_CARDINALITY_UPPER_LIMIT = 2L << 30;

  // dataset whose full path is embedded in the generated goal names
  private final DatasetConfig datasetConfig;
  // per-column statistics, taken from TableStats.getColumns()
  private final List<ColumnStats> columnStats;
  // taken from TableStats.getCount(); boxed — generate() substitutes 100,000 when its count is null
  private final Long count;

  public ReflectionSuggester(DatasetConfig datasetConfig, TableStats tableStats) {
    this.datasetConfig = datasetConfig;
    this.columnStats = tableStats.getColumns();
    this.count = tableStats.getCount();

  /**
   * Builds the suggested goals: one goal per raw reflection followed by one goal per aggregation
   * reflection.
   *
   * @return the raw goals followed by the aggregation goals
   */
  // NOTE(review): the builder calls after setName(...) and the closers of both anonymous classes are
  // not visible in this view of the file.
  public List<ReflectionGoal> getReflectionGoals() {
    // Ord.zip pairs each ReflectionDetails with its position; the position (reflectionDetails.i)
    // becomes the numeric suffix of the generated goal name.
    List<ReflectionGoal> rawGoals = Lists.transform(Ord.zip(getRawReflections()), new Function<Ord<ReflectionDetails>, ReflectionGoal>() {
      public ReflectionGoal apply(Ord<ReflectionDetails> reflectionDetails) {
        return new ReflectionGoal()
          // name pattern: AUTO_<quoted dataset path>_RAW_<index>
          .setName(String.format("AUTO_%s_RAW_%d", SqlUtils.quotedCompound(datasetConfig.getFullPathList()), reflectionDetails.i))
    // same construction for the aggregation reflections, using the _AGG_ infix
    List<ReflectionGoal> aggGoals = Lists.transform(Ord.zip(getAggReflections()), new Function<Ord<ReflectionDetails>, ReflectionGoal>() {
      public ReflectionGoal apply(Ord<ReflectionDetails> reflectionDetails) {
        return new ReflectionGoal()
          // name pattern: AUTO_<quoted dataset path>_AGG_<index>
          .setName(String.format("AUTO_%s_AGG_%d", SqlUtils.quotedCompound(datasetConfig.getFullPathList()), reflectionDetails.i))

    // raw goals first, then aggregation goals, copied into a single list
    return FluentIterable.from(rawGoals).append(aggGoals).toList();

  /**
   * Suggests aggregation reflections: picks dimension and measure candidates from the column
   * statistics and asks {@code generate} for an aggregation.
   *
   * @return the suggested aggregation details, or an empty list when there are no column statistics
   *         or no aggregation could be generated
   */
  // NOTE(review): several chained calls (the sources of the FluentIterable pipelines and their
  // terminal operations) and closing braces are not visible in this view of the file.
  private List<ReflectionDetails> getAggReflections() {
    List<ColumnStats> columns = columnStats;
    // nothing to suggest without column statistics
    if (columns.isEmpty()) {
      return Collections.emptyList();

    final int columnCount = columns.size();
    // min(MAX_MEASURE_FIELDS, max(1, truncated columnCount * MEASURE_FIELDS_RATIO))
    // NOTE(review): measureLimit is not used by any code visible here — confirm where it is applied
    final int measureLimit = Math.min(MAX_MEASURE_FIELDS, Math.max(1, (int) (columnCount * MEASURE_FIELDS_RATIO)));

    AnalysisSummary analysisSummary = AnalysisSummary.of(columns, count);

    // view field types indexed by field name
    final Map<String, ViewFieldType> schema = FluentIterable
      .uniqueIndex(new Function<ViewFieldType, String>() {
        public String apply(final ViewFieldType input) {
          return input.getName();

    // create a ranking based on stats
    final List<ColumnStats> candidates = FluentIterable

    final List<ColumnStats> dimension = FluentIterable
      // first filter: length / complexity check shared with DIMENSION_TYPE_FILTER
      .filter(new Predicate<ColumnStats>() {
        public boolean apply(final ColumnStats columnStats) {
          return DIMENSION_TYPE_FILTER.apply(columnStats);
      // second filter: type-based exclusion of some numeric columns
      .filter(new Predicate<ColumnStats>() {
        public boolean apply(final ColumnStats columnStats) {

          final RField columnRField = columnStats.getField();
          final String name = columnRField.getName();
          // NOTE(review): schema.get(name) is dereferenced without a null check on the next line;
          // presumably every column name is present in the schema — verify
          final ViewFieldType fieldType = schema.get(name);
          final SqlTypeName sqlTypeName = SqlTypeName.get(fieldType.getType());
          boolean result = true;

            * DX-7524: Some types from NUMERIC family (DECIMAL, FLOATs and not INTs) should never be
            * considered as DIMENSIONS. These should always be MEASURES. For other NUMERIC
            * types (INTEGER, TINYINT, SMALLINT, BIGINT), the existing logic of checking
            * cardinality and cartesian product remains unchanged.
          if(TypeUtils.isNumeric(columnRField)) {
            switch(sqlTypeName) {
            case DECIMAL:
            case FLOAT:
            case REAL:
            case DOUBLE:
              // NOTE(review): as visible here, result = false is immediately followed by
              // result = true; a break / default label between them appears to be missing from
              // this view — confirm against the complete file
              result = false;
              result = true;

          return result;

    // measure candidates: columns whose field passes MEASURE_TYPE_FILTER
    final List<ColumnStats> measure = FluentIterable
        .filter(new Predicate<ColumnStats>() {
          public boolean apply(final ColumnStats columnStats) {
            return MEASURE_TYPE_FILTER.apply(columnStats.getField());

    // generate aggregation suggestions
    Optional<AggregationDescriptor> aggregation = generate(dimension, measure, analysisSummary.getCount());

    if (aggregation.isPresent()) {
      return FluentIterable
          .transform(new Function<AggregationDescriptor, ReflectionDetails>() {
            public ReflectionDetails apply(final AggregationDescriptor input) {
              return new ReflectionDetails()
                    // each measure column becomes a ReflectionMeasureField carrying the default
                    // measure types for the field's type family
                    stats -> new ReflectionMeasureField(stats.getField().getName()).setMeasureTypeList(ReflectionValidator.getDefaultMeasures(stats.getField().getTypeFamily()))

    // no aggregation could be generated
    return Collections.emptyList();

  /**
   * Suggests raw reflection.
   * <p>
   * Current implementation is simply a pass through.
   */
  // NOTE(review): the builder call that consumes the Function below and the closers are not
  // visible in this view of the file.
  private List<ReflectionDetails> getRawReflections() {
    // as far as visible, a single ReflectionDetails is wrapped in an immutable list
    return ImmutableList.of(new ReflectionDetails()
            // maps a column to a ReflectionField named after the column's field
            new Function<ColumnStats, ReflectionField>() {
              public ReflectionField apply(ColumnStats column) {
                return new ReflectionField(column.getField().getName());

  /**
   * Generates a single aggregation.
   * <p>
   * This algorithm is heuristic based, so there is no guarantee of finding the optimal solution. The resulting plan
   * should satisfy the following:
   * <p>
   * (1) there is at least one dimension column; the first candidate is always selected
   * (2) the cardinality of every further dimension is at most 1% of {@code count} (100,000 is assumed when
   *     {@code count} is null)
   * (3) the cartesian product of the dimension cardinalities stays within 2^31, about 2 billion (assuming 500
   *     bytes for each row, this is 1 TB)
   */
  protected Optional<AggregationDescriptor> generate(final List<ColumnStats> dimensions,
      final List<ColumnStats> measures, Long count) {
    final List<ColumnStats> dimensionFields = Lists.newArrayList();

    if (!dimensions.isEmpty()) {
      //add the first one anyway
      long currentCardinalityProduct = dimensions.get(0).getCardinality();

      double cardinalityLimit  = (Optional.fromNullable(count).or(100_000L)) * .01;

      for (int i = 1; i < dimensions.size(); i++) {
        final ColumnStats field = dimensions.get(i);
        long newCardinalityProduct = currentCardinalityProduct * field.getCardinality();

        if (field.getCardinality() <= cardinalityLimit && newCardinalityProduct <= CARTESIAN_CARDINALITY_UPPER_LIMIT) {
          currentCardinalityProduct = newCardinalityProduct;

      final List<ColumnStats> measureFields = FluentIterable
          .filter(new Predicate<ColumnStats>() {
            public boolean apply(final ColumnStats columnStats) {
              return !dimensionFields.contains(columnStats);
      return Optional.of(new  AggregationDescriptor(ImmutableList.copyOf(dimensionFields), measureFields));
    return Optional.absent();

  // Converts column statistics into ReflectionDimensionField instances, one per column.
  // NOTE(review): the builder calls that populate each ReflectionDimensionField and the end of the
  // pipeline are not visible in this view of the file.
  private static List<ReflectionDimensionField> toReflectionDimensionFields(final Iterable<ColumnStats> columns) {
    return FluentIterable.from(columns)
        .transform(new Function<ColumnStats, ReflectionDimensionField>() {
          public ReflectionDimensionField apply(final ColumnStats columnStats) {
            return new ReflectionDimensionField()

  // Converts column statistics into a list of ReflectionField.
  // NOTE(review): the rest of the pipeline is not visible in this view of the file; presumably it
  // applies TO_REFLECTION_FIELD — confirm.
  static List<ReflectionField> toReflectionFields(final Iterable<ColumnStats> columns) {
    return FluentIterable.from(columns)

  // utility classes

  /**
   * Describes a suggested aggregation: the selected dimension columns and measure columns.
   */
  protected static class AggregationDescriptor {
    private final List<ColumnStats> dimensions;
    private final List<ColumnStats> measures;

    public AggregationDescriptor(final List<ColumnStats> dimensions, final List<ColumnStats> measures) {
      this.dimensions = Preconditions.checkNotNull(dimensions, "dimensions are required");
      this.measures = Preconditions.checkNotNull(measures, "measures are required");
      Preconditions.checkArgument(!dimensions.isEmpty(), "dimensions cannot be empty");

    public List<ColumnStats> getMeasures() {
      return measures;

    public List<ColumnStats> getDimensions() {
      return dimensions;

  /**
   * Summary of dataset analysis
   */
  private static final class AnalysisSummary {
    private final List<ColumnStats> analysis;
    private final long count;

    public AnalysisSummary(final List<ColumnStats> analysis, final long count) {
      this.analysis = analysis;
      this.count = count;

    public List<ColumnStats> getAnalysis() {
      return analysis;

    public long getCount() {
      return count;

    public static AnalysisSummary of(final List<ColumnStats> columnList, Long count) {
      final List<ColumnStats> columns = AccelerationUtils.selfOrEmpty(columnList);

      if (columns.isEmpty()) {
        return new AnalysisSummary(columnList, count);

      return new AnalysisSummary(columnList, count);