/* * Copyright (C) 2017-2019 Dremio Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.dremio.service.reflection.analysis; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import javax.annotation.Nullable; import org.apache.calcite.linq4j.Ord; import org.apache.calcite.sql.type.SqlTypeName; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.dremio.common.utils.SqlUtils; import com.dremio.exec.util.ViewFieldsHelper; import com.dremio.service.accelerator.AccelerationUtils; import com.dremio.service.namespace.dataset.proto.DatasetConfig; import com.dremio.service.namespace.dataset.proto.ViewFieldType; import com.dremio.service.reflection.ReflectionValidator; import com.dremio.service.reflection.analysis.ReflectionAnalyzer.ColumnStats; import com.dremio.service.reflection.analysis.ReflectionAnalyzer.RField; import com.dremio.service.reflection.analysis.ReflectionAnalyzer.TableStats; import com.dremio.service.reflection.proto.DimensionGranularity; import com.dremio.service.reflection.proto.ReflectionDetails; import com.dremio.service.reflection.proto.ReflectionDimensionField; import com.dremio.service.reflection.proto.ReflectionField; import com.dremio.service.reflection.proto.ReflectionGoal; import com.dremio.service.reflection.proto.ReflectionMeasureField; import com.dremio.service.reflection.proto.ReflectionType; import com.google.common.base.Function; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.collect.FluentIterable; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; /** * Suggest reflections */ public class ReflectionSuggester { private static final Logger logger = LoggerFactory.getLogger(ReflectionSuggester.class); private static final Comparator<ColumnStats> COLUMN_RANKING = new Comparator<ColumnStats>() { @Override public int compare(final ColumnStats left, final ColumnStats right) { return Long.compare( Optional.fromNullable(left.getCardinality()).or(Long.MAX_VALUE), Optional.fromNullable(right.getCardinality()).or(Long.MAX_VALUE) ); } }; private static final Function<ColumnStats, ReflectionField> TO_REFLECTION_FIELD = new Function<ColumnStats, ReflectionField>() { @Override public ReflectionField apply(final ColumnStats columnStats) { return new ReflectionField(columnStats.getField().getName()); } }; private static final Predicate<RField> MEASURE_TYPE_FILTER = new Predicate<RField>() { @Override public boolean apply(@Nullable final RField input) { return TypeUtils.isNumeric(input); } }; private static final Predicate<ColumnStats> DIMENSION_TYPE_FILTER = new Predicate<ColumnStats>() { @Override public boolean apply(@Nullable final ColumnStats columnStats) { return columnStats.getMaxLength() <= MAX_DIMENSION_FIELD_LENGTH && !TypeUtils.isComplex(columnStats.getField()); } }; // ratio of dimension fields to all fields private static final double DIMENSION_FIELDS_RATIO = 1; // maximum number of measure fields to discover private static final int MAX_MEASURE_FIELDS = 30; // ratio of measure fields to all fields private static final double MEASURE_FIELDS_RATIO = .9; // max field length to be considered as dimension or measure column private static final int MAX_DIMENSION_FIELD_LENGTH = 50; //Setup an upper limit for cartesian product 2 pow 31 = 1 TB assuming 500 bytes for a row private static final long CARTESIAN_CARDINALITY_UPPER_LIMIT = 2L << 30; private final DatasetConfig datasetConfig; private final List<ColumnStats> columnStats; private final Long count; public ReflectionSuggester(DatasetConfig datasetConfig, TableStats tableStats) { this.datasetConfig = datasetConfig; this.columnStats = tableStats.getColumns(); this.count = tableStats.getCount(); } public List<ReflectionGoal> getReflectionGoals() { List<ReflectionGoal> rawGoals = Lists.transform(Ord.zip(getRawReflections()), new Function<Ord<ReflectionDetails>, ReflectionGoal>() { @Override public ReflectionGoal apply(Ord<ReflectionDetails> reflectionDetails) { return new ReflectionGoal() .setName(String.format("AUTO_%s_RAW_%d", SqlUtils.quotedCompound(datasetConfig.getFullPathList()), reflectionDetails.i)) .setDetails(reflectionDetails.e) .setType(ReflectionType.RAW); } }); List<ReflectionGoal> aggGoals = Lists.transform(Ord.zip(getAggReflections()), new Function<Ord<ReflectionDetails>, ReflectionGoal>() { @Override public ReflectionGoal apply(Ord<ReflectionDetails> reflectionDetails) { return new ReflectionGoal() .setName(String.format("AUTO_%s_AGG_%d", SqlUtils.quotedCompound(datasetConfig.getFullPathList()), reflectionDetails.i)) .setDetails(reflectionDetails.e) .setType(ReflectionType.AGGREGATION); } }); return FluentIterable.from(rawGoals).append(aggGoals).toList(); } private List<ReflectionDetails> getAggReflections() { List<ColumnStats> columns = columnStats; if (columns.isEmpty()) { return Collections.emptyList(); } final int columnCount = columns.size(); final int measureLimit = Math.min(MAX_MEASURE_FIELDS, Math.max(1, (int) (columnCount * MEASURE_FIELDS_RATIO))); AnalysisSummary analysisSummary = AnalysisSummary.of(columns, count); final Map<String, ViewFieldType> schema = FluentIterable .from(Optional.fromNullable(ViewFieldsHelper.getViewFields(datasetConfig)).or(Collections.<ViewFieldType>emptyList())) .uniqueIndex(new Function<ViewFieldType, String>() { @Override public String apply(final ViewFieldType input) { return input.getName(); } }); // create a ranking based on stats final List<ColumnStats> candidates = FluentIterable .from(columns) .toSortedList(COLUMN_RANKING); final List<ColumnStats> dimension = FluentIterable .from(candidates) .filter(new Predicate<ColumnStats>() { @Override public boolean apply(final ColumnStats columnStats) { return DIMENSION_TYPE_FILTER.apply(columnStats); } }) .filter(new Predicate<ColumnStats>() { @Override public boolean apply(final ColumnStats columnStats) { final RField columnRField = columnStats.getField(); final String name = columnRField.getName(); final ViewFieldType fieldType = schema.get(name); final SqlTypeName sqlTypeName = SqlTypeName.get(fieldType.getType()); boolean result = true; /* * DX-7524: Some types from NUMERIC family (DECIMAL, FLOATs and not INTs) should never be * considered as DIMENSIONS. These should always be MEASURES. For other NUMERIC * types (INTEGER, TINYINT, SMALLINT, BIGINT), the existing logic of checking * cardinality and cartesian product remains unchanged. */ if(TypeUtils.isNumeric(columnRField)) { switch(sqlTypeName) { case DECIMAL: case FLOAT: case REAL: case DOUBLE: result = false; break; default: result = true; break; } } return result; } }) .toList(); final List<ColumnStats> measure = FluentIterable .from(Lists.reverse(candidates)) .filter(new Predicate<ColumnStats>() { @Override public boolean apply(final ColumnStats columnStats) { return MEASURE_TYPE_FILTER.apply(columnStats.getField()); } }) .limit(measureLimit) .toList(); // generate aggregation suggestions Optional<AggregationDescriptor> aggregation = generate(dimension, measure, analysisSummary.getCount()); if (aggregation.isPresent()) { return FluentIterable .from(Arrays.asList(aggregation.get())) .transform(new Function<AggregationDescriptor, ReflectionDetails>() { @Override public ReflectionDetails apply(final AggregationDescriptor input) { return new ReflectionDetails() .setDimensionFieldList(toReflectionDimensionFields(input.getDimensions())) .setMeasureFieldList(input.getMeasures().stream().map( stats -> new ReflectionMeasureField(stats.getField().getName()).setMeasureTypeList(ReflectionValidator.getDefaultMeasures(stats.getField().getTypeFamily())) ).collect(Collectors.toList())); } }) .toList(); } return Collections.emptyList(); } /** * Suggests raw reflection. * <p> * Current implementation is simply a pass through. * */ private List<ReflectionDetails> getRawReflections() { return ImmutableList.of(new ReflectionDetails() .setDisplayFieldList( Lists.transform( columnStats, new Function<ColumnStats, ReflectionField>() { @Override public ReflectionField apply(ColumnStats column) { return new ReflectionField(column.getField().getName()); } } ) )); } /** * Generates a single aggregation. * <p> * This algorithm is heuristic based so there no guarantees as to find the optimal solution. The resulting plan * should satisfy the following: * <p> * (1) there is at least a dimension column * (2) cardinality of each dimension should be less than square root of max cardinality(except if there is only one dimension) * (3) cartesian product should be less than 2 Billion (assuming 500 bytes for each row, this is 1 TB) */ protected Optional<AggregationDescriptor> generate(final List<ColumnStats> dimensions, final List<ColumnStats> measures, Long count) { final List<ColumnStats> dimensionFields = Lists.newArrayList(); if (!dimensions.isEmpty()) { //add the first one anyway dimensionFields.add(dimensions.get(0)); long currentCardinalityProduct = dimensions.get(0).getCardinality(); double cardinalityLimit = (Optional.fromNullable(count).or(100_000L)) * .01; for (int i = 1; i < dimensions.size(); i++) { final ColumnStats field = dimensions.get(i); long newCardinalityProduct = currentCardinalityProduct * field.getCardinality(); if (field.getCardinality() <= cardinalityLimit && newCardinalityProduct <= CARTESIAN_CARDINALITY_UPPER_LIMIT) { dimensionFields.add(field); currentCardinalityProduct = newCardinalityProduct; } } final List<ColumnStats> measureFields = FluentIterable .from(measures) .filter(new Predicate<ColumnStats>() { @Override public boolean apply(final ColumnStats columnStats) { return !dimensionFields.contains(columnStats); } }).toList(); return Optional.of(new AggregationDescriptor(ImmutableList.copyOf(dimensionFields), measureFields)); } return Optional.absent(); } private static List<ReflectionDimensionField> toReflectionDimensionFields(final Iterable<ColumnStats> columns) { return FluentIterable.from(columns) .transform(new Function<ColumnStats, ReflectionDimensionField>() { @Nullable @Override public ReflectionDimensionField apply(final ColumnStats columnStats) { return new ReflectionDimensionField() .setName(columnStats.getField().getName()) .setGranularity(DimensionGranularity.DATE); } }) .toList(); } static List<ReflectionField> toReflectionFields(final Iterable<ColumnStats> columns) { return FluentIterable.from(columns) .transform(TO_REFLECTION_FIELD) .toList(); } // utility classes /** * Returns a truncated view root that is the bottom part of the plan after splitting input view from aggregation. */ protected static class AggregationDescriptor { private final List<ColumnStats> dimensions; private final List<ColumnStats> measures; public AggregationDescriptor(final List<ColumnStats> dimensions, final List<ColumnStats> measures) { this.dimensions = Preconditions.checkNotNull(dimensions, "dimensions are required"); this.measures = Preconditions.checkNotNull(measures, "measures are required"); Preconditions.checkArgument(!dimensions.isEmpty(), "dimensions cannot be empty"); } public List<ColumnStats> getMeasures() { return measures; } public List<ColumnStats> getDimensions() { return dimensions; } } /** * Summary of dataset analysis */ private static final class AnalysisSummary { private final List<ColumnStats> analysis; private final long count; public AnalysisSummary(final List<ColumnStats> analysis, final long count) { this.analysis = analysis; this.count = count; } public List<ColumnStats> getAnalysis() { return analysis; } public long getCount() { return count; } public static AnalysisSummary of(final List<ColumnStats> columnList, Long count) { final List<ColumnStats> columns = AccelerationUtils.selfOrEmpty(columnList); if (columns.isEmpty()) { return new AnalysisSummary(columnList, count); } return new AnalysisSummary(columnList, count); } } }