/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.datasketches.hive.theta; import static org.apache.datasketches.Util.DEFAULT_NOMINAL_ENTRIES; import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED; import java.util.Arrays; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @Description( name = "dataToSketch", value = "_FUNC_(expr, size, prob, seed) - " + "Compute a sketch of given size, sampling probability and seed on data 'expr'", extended = "Example:\n" + "> SELECT dataToSketch(val, 16384) FROM src;\n" + "The return value is a binary blob that can be operated on by other sketch related functions." + " The sketch size is optional, must be a power of 2 and " + "controls the relative error expected from the sketch." + " A size of 16384 can be expected to yield errors of roughly +-1.5% in the estimation of uniques." + " The default size is defined in the sketches-core library " + "and at the time of this writing was 4096 (about 3% error)." + " The sampling probability is optional and must be from 0 to 1. The default is 1 (no sampling)" + " The seed is optional, and using it is not recommended unless you really know why you need it") @SuppressWarnings("javadoc") public class DataToSketchUDAF extends AbstractGenericUDAFResolver { /** * Performs argument number and type validation. DataToSketch expects * to receive between one and four arguments. * <ul> * <li>The first (required) is the value to add to the sketch and must be a primitive.</li> * * <li>The second (optional) is the sketch size to use. This must be an integral value * and must be constant.</li> * * <li>The third (optional) is the sampling probability and is a floating point value between * 0.0 and 1.0. It must be a constant</li> * * <li>The fourth (optional) is an update seed. * It must be an integral value and must be constant.</li> * </ul> * * @see org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver * #getEvaluator(org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo) * * @param info Parameter info to validate * @return The GenericUDAFEvaluator that should be used to calculate the function. */ @Override public GenericUDAFEvaluator getEvaluator(final GenericUDAFParameterInfo info) throws SemanticException { final ObjectInspector[] parameters = info.getParameterObjectInspectors(); // Validate the correct number of parameters if (parameters.length < 1) { throw new UDFArgumentException("Please specify at least 1 argument"); } if (parameters.length > 4) { throw new UDFArgumentException("Please specify no more than 4 arguments"); } // Validate first parameter type ObjectInspectorValidator.validateCategoryPrimitive(parameters[0], 0); // Validate second argument if present if (parameters.length > 1) { ObjectInspectorValidator.validateIntegralParameter(parameters[1], 1); if (!ObjectInspectorUtils.isConstantObjectInspector(parameters[1])) { throw new UDFArgumentTypeException(1, "The second argument must be a constant"); } } // Validate third argument if present if (parameters.length > 2) { ObjectInspectorValidator.validateFloatingPointParameter(parameters[2], 2); if (!ObjectInspectorUtils.isConstantObjectInspector(parameters[2])) { throw new UDFArgumentTypeException(2, "The third argument must be a constant"); } } // Validate fourth argument if present if (parameters.length > 3) { ObjectInspectorValidator.validateIntegralParameter(parameters[3], 3); if (!ObjectInspectorUtils.isConstantObjectInspector(parameters[3])) { throw new UDFArgumentTypeException(3, "The fourth argument must be a constant"); } } return new DataToSketchEvaluator(); } public static class DataToSketchEvaluator extends UnionEvaluator { // FOR PARTIAL1 and COMPLETE modes: ObjectInspectors for original data private transient PrimitiveObjectInspector samplingProbabilityObjectInspector; /* * (non-Javadoc) * * @see * org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator#init(org.apache * .hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode, * org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector[]) */ @Override public ObjectInspector init(final Mode mode, final ObjectInspector[] parameters) throws HiveException { super.init(mode, parameters); if ((mode == Mode.PARTIAL1) || (mode == Mode.COMPLETE)) { // input is original data inputObjectInspector = (PrimitiveObjectInspector) parameters[0]; if (parameters.length > 1) { nominalEntriesObjectInspector = (PrimitiveObjectInspector) parameters[1]; } if (parameters.length > 2) { samplingProbabilityObjectInspector = (PrimitiveObjectInspector) parameters[2]; } if (parameters.length > 3) { seedObjectInspector = (PrimitiveObjectInspector) parameters[3]; } } else { // input for PARTIAL2 and FINAL is the output from PARTIAL1 intermediateObjectInspector = (StructObjectInspector) parameters[0]; } if ((mode == Mode.PARTIAL1) || (mode == Mode.PARTIAL2)) { // intermediate results need to include the the nominal number of entries and the seed return ObjectInspectorFactory.getStandardStructObjectInspector( Arrays.asList(NOMINAL_ENTRIES_FIELD, SEED_FIELD, SKETCH_FIELD), Arrays.asList( PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.INT), PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.LONG), PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.BINARY) ) ); } // final results include just the sketch return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(PrimitiveCategory.BINARY); } /* * (non-Javadoc) * * @see * org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator#iterate(org * .apache * .hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer, * java.lang.Object[]) */ @Override public void iterate(final @SuppressWarnings("deprecation") AggregationBuffer agg, final Object[] parameters) throws HiveException { if (parameters[0] == null) { return; } final UnionState state = (UnionState) agg; if (!state.isInitialized()) { initializeState(state, parameters); } state.update(parameters[0], inputObjectInspector); } private void initializeState(final UnionState state, final Object[] parameters) { int sketchSize = DEFAULT_NOMINAL_ENTRIES; if (nominalEntriesObjectInspector != null) { sketchSize = PrimitiveObjectInspectorUtils.getInt(parameters[1], nominalEntriesObjectInspector); } float samplingProbability = UnionState.DEFAULT_SAMPLING_PROBABILITY; if (samplingProbabilityObjectInspector != null) { samplingProbability = PrimitiveObjectInspectorUtils.getFloat(parameters[2], samplingProbabilityObjectInspector); } long seed = DEFAULT_UPDATE_SEED; if (seedObjectInspector != null) { seed = PrimitiveObjectInspectorUtils.getLong(parameters[3], seedObjectInspector); } state.init(sketchSize, samplingProbability, seed); } } }