Java Code Examples for org.apache.kylin.cube.CubeSegment#getConfig()

The following examples show how to use org.apache.kylin.cube.CubeSegment#getConfig(). They are drawn from open-source Apache Kylin projects; the originating project and source file are noted above each example.
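In all of these examples the pattern is the same: CubeSegment#getConfig() returns the KylinConfig in effect for that segment, and callers then read job and planner settings from it. The minimal sketch below is not taken from any of the examples; it assumes a CubeSegment named segment has already been obtained (e.g. from its CubeInstance) and simply illustrates that pattern with a few of the getters that appear further down on this page.

import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.cube.CubeSegment;

public class SegmentConfigSketch {

    // Hypothetical helper: read a few job-related settings from the segment's
    // effective configuration, the same way the examples on this page do.
    public static void printJobSettings(CubeSegment segment) {
        KylinConfig config = segment.getConfig();

        String algorithm = config.getCubeAlgorithm();           // used in Example 13
        double sizeRatio = config.getJobCuboidSizeRatio();      // used in Example 14
        int hllPrecision = config.getCubeStatsHLLPrecision();   // used in Example 16

        System.out.println("algorithm=" + algorithm + ", cuboidSizeRatio=" + sizeRatio
                + ", statsHLLPrecision=" + hllPrecision);
    }
}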
Example 1
Source File: CuboidStatsReaderUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static Pair<Map<Long, Long>, Long> readCuboidStatsWithSourceFromSegment(Set<Long> cuboidIds,
        CubeSegment cubeSegment) throws IOException {
    if (cubeSegment == null) {
        logger.warn("The cube segment can not be " + null);
        return null;
    }

    CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, null, cubeSegment.getConfig());
    if (cubeStatsReader.getCuboidRowEstimatesHLL() == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().isEmpty()) {
        logger.info("Cuboid Statistics is not enabled.");
        return null;
    }

    Map<Long, Long> cuboidsWithStatsAll = cubeStatsReader.getCuboidRowEstimatesHLL();
    Map<Long, Long> cuboidsWithStats = Maps.newHashMapWithExpectedSize(cuboidIds.size());
    for (Long cuboid : cuboidIds) {
        Long rowEstimate = cuboidsWithStatsAll.get(cuboid);
        if (rowEstimate == null) {
            logger.warn("Cannot get the row count stats for cuboid " + cuboid);
        } else {
            cuboidsWithStats.put(cuboid, rowEstimate);
        }
    }
    return new Pair<>(cuboidsWithStats, cubeStatsReader.sourceRowCount);
}
 
Example 2
Source File: MapReduceUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static Pair<Integer, Integer> getConvergeCuboidDataReduceTaskNums(CubeSegment cubeSeg) throws IOException {
    long baseCuboidId = cubeSeg.getCuboidScheduler().getBaseCuboidId();

    Set<Long> overlapCuboids = Sets.newHashSet(cubeSeg.getCuboidScheduler().getAllCuboidIds());
    overlapCuboids.retainAll(cubeSeg.getCubeInstance().getCuboidsRecommend());
    overlapCuboids.add(baseCuboidId);

    Pair<Map<Long, Long>, Long> cuboidStats = CuboidStatsReaderUtil
            .readCuboidStatsWithSourceFromSegment(overlapCuboids, cubeSeg);
    Map<Long, Double> cubeSizeMap = CubeStatsReader.getCuboidSizeMapFromRowCount(cubeSeg, cuboidStats.getFirst(),
            cuboidStats.getSecond());
    double totalSizeInM = 0;
    for (Double cuboidSize : cubeSizeMap.values()) {
        totalSizeInM += cuboidSize;
    }

    double baseSizeInM = cubeSizeMap.get(baseCuboidId);

    KylinConfig kylinConfig = cubeSeg.getConfig();
    int nBase = getReduceTaskNum(baseSizeInM, kylinConfig);
    int nOther = getReduceTaskNum(totalSizeInM - baseSizeInM, kylinConfig);
    return new Pair<>(nBase + nOther, nBase);
}
 
Example 3
Source File: CuboidStatsReaderUtil.java    From kylin with Apache License 2.0
public static Pair<Map<Long, Long>, Long> readCuboidStatsWithSourceFromSegment(Set<Long> cuboidIds,
        CubeSegment cubeSegment) throws IOException {
    if (cubeSegment == null) {
        logger.warn("The cube segment can not be " + null);
        return null;
    }

    CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, null, cubeSegment.getConfig());
    if (cubeStatsReader.getCuboidRowEstimatesHLL() == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().isEmpty()) {
        logger.info("Cuboid Statistics is not enabled.");
        return null;
    }

    Map<Long, Long> cuboidsWithStatsAll = cubeStatsReader.getCuboidRowEstimatesHLL();
    Map<Long, Long> cuboidsWithStats = Maps.newHashMapWithExpectedSize(cuboidIds.size());
    for (Long cuboid : cuboidIds) {
        Long rowEstimate = cuboidsWithStatsAll.get(cuboid);
        if (rowEstimate == null) {
            logger.warn("Cannot get the row count stats for cuboid " + cuboid);
        } else {
            cuboidsWithStats.put(cuboid, rowEstimate);
        }
    }
    return new Pair<>(cuboidsWithStats, cubeStatsReader.sourceRowCount);
}
 
Example 4
Source File: CuboidRecommenderUtil.java    From kylin with Apache License 2.0
/** For future segment-level recommendation */
public static Map<Long, Long> getRecommendCuboidList(CubeSegment segment, Map<Long, Long> hitFrequencyMap,
        Map<Long, Map<Long, Pair<Long, Long>>> rollingUpCountSourceMap, boolean ifForceRecommend)
        throws IOException {
    if (segment == null) {
        return null;
    }

    CubeStatsReader cubeStatsReader = new CubeStatsReader(segment, null, segment.getConfig());
    if (cubeStatsReader.getCuboidRowEstimatesHLL() == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().isEmpty()) {
        logger.info("Cuboid Statistics is not enabled.");
        return null;
    }
    CubeInstance cube = segment.getCubeInstance();
    long baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
    if (cubeStatsReader.getCuboidRowEstimatesHLL().get(baseCuboid) == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().get(baseCuboid) == 0L) {
        logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO);
        return null;
    }

    String key = cube.getName() + "-" + segment.getName();
    CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, cubeStatsReader.getCuboidRowEstimatesHLL(),
            cubeStatsReader.getCuboidSizeMap()).setHitFrequencyMap(hitFrequencyMap)
                    .setRollingUpCountSourceMap(rollingUpCountSourceMap).build();
    return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, segment.getConfig(),
            ifForceRecommend);
}
 
Example 5
Source File: JobBuilderSupport.java    From kylin-on-parquet-v2 with Apache License 2.0
public JobBuilderSupport(CubeSegment seg, String submitter, Integer priorityOffset) {
    Preconditions.checkNotNull(seg, "segment cannot be null");
    this.config = new JobEngineConfig(seg.getConfig());
    this.seg = seg;
    this.submitter = submitter;
    this.priorityOffset = priorityOffset;
}
 
Example 6
Source File: MapReduceUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static int getInmemCubingReduceTaskNum(CubeSegment cubeSeg, CuboidScheduler cuboidScheduler)
        throws IOException {
    KylinConfig kylinConfig = cubeSeg.getConfig();

    Map<Long, Double> cubeSizeMap = new CubeStatsReader(cubeSeg, cuboidScheduler, kylinConfig).getCuboidSizeMap();
    double totalSizeInM = 0;
    for (Double cuboidSize : cubeSizeMap.values()) {
        totalSizeInM += cuboidSize;
    }
    return getReduceTaskNum(totalSizeInM, kylinConfig);
}
 
Example 7
Source File: CuboidRecommenderUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
/** Trigger cube planner phase one */
public static Map<Long, Long> getRecommendCuboidList(CubeSegment segment) throws IOException {
    if (segment == null) {
        return null;
    }

    CubeStatsReader cubeStatsReader = new CubeStatsReader(segment, null, segment.getConfig());
    if (cubeStatsReader.getCuboidRowEstimatesHLL() == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().isEmpty()) {
        logger.info("Cuboid Statistics is not enabled.");
        return null;
    }
    CubeInstance cube = segment.getCubeInstance();
    long baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
    if (cubeStatsReader.getCuboidRowEstimatesHLL().get(baseCuboid) == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().get(baseCuboid) == 0L) {
        logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO);
        return null;
    }

    Set<Long> mandatoryCuboids = segment.getCubeDesc().getMandatoryCuboids();

    String key = cube.getName();
    CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, cubeStatsReader.getCuboidRowEstimatesHLL(),
            cubeStatsReader.getCuboidSizeMap()).setMandatoryCuboids(mandatoryCuboids).setBPUSMinBenefitRatio(segment.getConfig().getCubePlannerBPUSMinBenefitRatio()).build();
    return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, segment.getConfig(),
            !mandatoryCuboids.isEmpty());
}
 
Example 8
Source File: CuboidRecommenderUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
/** For future segment-level recommendation */
public static Map<Long, Long> getRecommendCuboidList(CubeSegment segment, Map<Long, Long> hitFrequencyMap,
        Map<Long, Map<Long, Pair<Long, Long>>> rollingUpCountSourceMap, boolean ifForceRecommend)
        throws IOException {
    if (segment == null) {
        return null;
    }

    CubeStatsReader cubeStatsReader = new CubeStatsReader(segment, null, segment.getConfig());
    if (cubeStatsReader.getCuboidRowEstimatesHLL() == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().isEmpty()) {
        logger.info("Cuboid Statistics is not enabled.");
        return null;
    }
    CubeInstance cube = segment.getCubeInstance();
    long baseCuboid = cube.getCuboidScheduler().getBaseCuboidId();
    if (cubeStatsReader.getCuboidRowEstimatesHLL().get(baseCuboid) == null
            || cubeStatsReader.getCuboidRowEstimatesHLL().get(baseCuboid) == 0L) {
        logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO);
        return null;
    }

    String key = cube.getName() + "-" + segment.getName();
    CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, cubeStatsReader.getCuboidRowEstimatesHLL(),
            cubeStatsReader.getCuboidSizeMap()).setHitFrequencyMap(hitFrequencyMap)
                    .setRollingUpCountSourceMap(rollingUpCountSourceMap).build();
    return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, segment.getConfig(),
            ifForceRecommend);
}
 
Example 9
Source File: KafkaInputBase.java    From kylin with Apache License 2.0
public BaseBatchCubingInputSide(CubeSegment seg, IJoinedFlatTableDesc flatDesc) {
    this.conf = new JobEngineConfig(KylinConfig.getInstanceFromEnv());
    this.config = seg.getConfig();
    this.flatDesc = flatDesc;
    this.hiveTableDatabase = config.getHiveDatabaseForIntermediateTable();
    this.seg = seg;
    this.cubeDesc = seg.getCubeDesc();
    this.cubeName = seg.getCubeInstance().getName();
}
 
Example 10
Source File: KafkaInputBase.java    From kylin-on-parquet-v2 with Apache License 2.0
public BaseBatchCubingInputSide(CubeSegment seg, IJoinedFlatTableDesc flatDesc) {
    this.conf = new JobEngineConfig(KylinConfig.getInstanceFromEnv());
    this.config = seg.getConfig();
    this.flatDesc = flatDesc;
    this.hiveTableDatabase = config.getHiveDatabaseForIntermediateTable();
    this.seg = seg;
    this.cubeDesc = seg.getCubeDesc();
    this.cubeName = seg.getCubeInstance().getName();
}
 
Example 11
Source File: MapReduceUtil.java    From kylin with Apache License 2.0
public static int getInmemCubingReduceTaskNum(CubeSegment cubeSeg, CuboidScheduler cuboidScheduler)
        throws IOException {
    KylinConfig kylinConfig = cubeSeg.getConfig();

    Map<Long, Double> cubeSizeMap = new CubeStatsReader(cubeSeg, cuboidScheduler, kylinConfig).getCuboidSizeMap();
    double totalSizeInM = 0;
    for (Double cuboidSize : cubeSizeMap.values()) {
        totalSizeInM += cuboidSize;
    }
    return getReduceTaskNum(totalSizeInM, kylinConfig);
}
 
Example 12
Source File: JobBuilderSupport.java    From kylin with Apache License 2.0
public JobBuilderSupport(CubeSegment seg, String submitter, Integer priorityOffset) {
    Preconditions.checkNotNull(seg, "segment cannot be null");
    this.config = new JobEngineConfig(seg.getConfig());
    this.seg = seg;
    this.submitter = submitter;
    this.priorityOffset = priorityOffset;
}
 
Example 13
Source File: StatisticsDecisionUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static void decideCubingAlgorithm(CubingJob cubingJob, CubeSegment seg, double mapperOverlapRatio,
        int mapperNumber) throws IOException {
    KylinConfig kylinConf = seg.getConfig();
    String algPref = kylinConf.getCubeAlgorithm();
    CubingJob.AlgorithmEnum alg;
    if (mapperOverlapRatio == 0 && kylinConf.isAutoInmemToOptimize()) { // no source records
        alg = CubingJob.AlgorithmEnum.INMEM;
    } else if (CubingJob.AlgorithmEnum.INMEM.name().equalsIgnoreCase(algPref)) {
        alg = CubingJob.AlgorithmEnum.INMEM;
        if (seg.getCubeDesc().isStreamingCube() && CubingJob.CubingJobTypeEnum
                .getByName(cubingJob.getJobType()) == CubingJob.CubingJobTypeEnum.BUILD) {
            alg = CubingJob.AlgorithmEnum.LAYER;
        }
    } else if (CubingJob.AlgorithmEnum.LAYER.name().equalsIgnoreCase(algPref)) {
        alg = CubingJob.AlgorithmEnum.LAYER;
    } else {
        int memoryHungryMeasures = 0;
        for (MeasureDesc measure : seg.getCubeDesc().getMeasures()) {
            if (measure.getFunction().getMeasureType().isMemoryHungry()) {
                logger.info("This cube has memory-hungry measure " + measure.getFunction().getExpression());
                memoryHungryMeasures++;
            }
        }

        if (memoryHungryMeasures > 0) {
            alg = CubingJob.AlgorithmEnum.LAYER;
        } else if ("random".equalsIgnoreCase(algPref)) { // for testing
            alg = new Random().nextBoolean() ? CubingJob.AlgorithmEnum.INMEM : CubingJob.AlgorithmEnum.LAYER;
        } else { // the default
            int mapperNumLimit = kylinConf.getCubeAlgorithmAutoMapperLimit();
            double overlapThreshold = kylinConf.getCubeAlgorithmAutoThreshold();
            logger.info("mapperNumber for " + seg + " is " + mapperNumber + " and threshold is " + mapperNumLimit);
            logger.info("mapperOverlapRatio for " + seg + " is " + mapperOverlapRatio + " and threshold is "
                    + overlapThreshold);

            // in-mem cubing is good when
            // 1) the cluster has enough mapper slots to run in parallel
            // 2) the mapper overlap ratio is small, meaning the shuffle of in-mem MR has advantage
            alg = (mapperNumber <= mapperNumLimit && mapperOverlapRatio <= overlapThreshold)//
                    ? CubingJob.AlgorithmEnum.INMEM
                    : CubingJob.AlgorithmEnum.LAYER;
        }

    }
    logger.info("The cube algorithm for " + seg + " is " + alg);

    cubingJob.setAlgorithm(alg);
}
 
Example 14
Source File: CubeStatsReader.java    From kylin with Apache License 2.0
/**
 * Estimate the cuboid's size
 *
 * @return the cuboid size in M bytes
 */
private static double estimateCuboidStorageSize(CubeSegment cubeSegment, long cuboidId, long rowCount,
        long baseCuboidId, long baseCuboidCount, List<Integer> rowKeyColumnLength, long sourceRowCount) {

    int rowkeyLength = cubeSegment.getRowKeyPreambleSize();
    KylinConfig kylinConf = cubeSegment.getConfig();

    long mask = Long.highestOneBit(baseCuboidId);
    long parentCuboidIdActualLength = (long) Long.SIZE - Long.numberOfLeadingZeros(baseCuboidId);
    for (int i = 0; i < parentCuboidIdActualLength; i++) {
        if ((mask & cuboidId) > 0) {
            rowkeyLength += rowKeyColumnLength.get(i); //colIO.getColumnLength(columnList.get(i));
        }
        mask = mask >> 1;
    }

    // add the measure length
    int normalSpace = rowkeyLength;
    int countDistinctSpace = 0;
    double percentileSpace = 0;
    int topNSpace = 0;
    for (MeasureDesc measureDesc : cubeSegment.getCubeDesc().getMeasures()) {
        if (rowCount == 0)
            break;
        DataType returnType = measureDesc.getFunction().getReturnDataType();
        if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_COUNT_DISTINCT)) {
            long estimateDistinctCount = sourceRowCount / rowCount;
            estimateDistinctCount = estimateDistinctCount == 0 ? 1L : estimateDistinctCount;
            countDistinctSpace += returnType.getStorageBytesEstimate(estimateDistinctCount);
        } else if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_PERCENTILE)) {
            percentileSpace += returnType.getStorageBytesEstimate(baseCuboidCount * 1.0 / rowCount);
        } else if (measureDesc.getFunction().getExpression().equals(TopNMeasureType.FUNC_TOP_N)) {
            long estimateTopNCount = sourceRowCount / rowCount;
            estimateTopNCount = estimateTopNCount == 0 ? 1L : estimateTopNCount;
            topNSpace += returnType.getStorageBytesEstimate(estimateTopNCount);
        } else {
            normalSpace += returnType.getStorageBytesEstimate();
        }
    }

    double cuboidSizeRatio = kylinConf.getJobCuboidSizeRatio();
    double cuboidSizeMemHungryRatio = kylinConf.getJobCuboidSizeCountDistinctRatio();
    double cuboidSizeTopNRatio = kylinConf.getJobCuboidSizeTopNRatio();

    double ret = (1.0 * normalSpace * rowCount * cuboidSizeRatio
            + 1.0 * countDistinctSpace * rowCount * cuboidSizeMemHungryRatio + 1.0 * percentileSpace * rowCount
            + 1.0 * topNSpace * rowCount * cuboidSizeTopNRatio) / (1024L * 1024L);
    return ret;
}
 
Example 15
Source File: StatisticsDecisionUtil.java    From kylin with Apache License 2.0
public static void decideCubingAlgorithm(CubingJob cubingJob, CubeSegment seg) throws IOException {
    CubeStatsReader cubeStats = new CubeStatsReader(seg, null, seg.getConfig());
    decideCubingAlgorithm(cubingJob, seg, cubeStats.getMapperOverlapRatioOfFirstBuild(),
            cubeStats.getMapperNumberOfFirstBuild());
}
 
Example 16
Source File: SaveStatisticsStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    CubeSegment newSegment = CubingExecutableUtil.findSegment(context,
            CubingExecutableUtil.getCubeName(this.getParams()),
            CubingExecutableUtil.getSegmentId(this.getParams()));
    KylinConfig kylinConf = newSegment.getConfig();

    ResourceStore rs = ResourceStore.getStore(kylinConf);
    try {

        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
        Path statisticsDir = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        Path[] statisticsFiles = HadoopUtil.getFilteredPath(fs, statisticsDir,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDir);
        }

        Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
        long totalRowsBeforeMerge = 0;
        long grantTotal = 0;
        int samplingPercentage = -1;
        int mapperNumber = -1;
        for (Path item : statisticsFiles) {
            CubeStatsReader.CubeStatsResult cubeStatsResult = new CubeStatsReader.CubeStatsResult(item,
                    kylinConf.getCubeStatsHLLPrecision());
            cuboidHLLMap.putAll(cubeStatsResult.getCounterMap());
            long pGrantTotal = 0L;
            for (HLLCounter hll : cubeStatsResult.getCounterMap().values()) {
                pGrantTotal += hll.getCountEstimate();
            }
            totalRowsBeforeMerge += pGrantTotal * cubeStatsResult.getMapperOverlapRatio();
            grantTotal += pGrantTotal;
            int pMapperNumber = cubeStatsResult.getMapperNumber();
            if (pMapperNumber > 0) {
                if (mapperNumber < 0) {
                    mapperNumber = pMapperNumber;
                } else {
                    throw new RuntimeException(
                            "Base cuboid has been distributed to multiple reducers at step FactDistinctColumnsReducer!!!");
                }
            }
            int pSamplingPercentage = cubeStatsResult.getPercentage();
            if (samplingPercentage < 0) {
                samplingPercentage = pSamplingPercentage;
            } else if (samplingPercentage != pSamplingPercentage) {
                throw new RuntimeException(
                        "The sampling percentage should be same among all of the reducer of FactDistinctColumnsReducer!!!");
            }
        }
        if (samplingPercentage < 0) {
            logger.warn("The sampling percentage should be set!!!");
        }
        if (mapperNumber < 0) {
            logger.warn("The mapper number should be set!!!");
        }

        if (logger.isDebugEnabled()) {
            logMapperAndCuboidStatistics(cuboidHLLMap, samplingPercentage, mapperNumber, grantTotal,
                    totalRowsBeforeMerge);
        }
        double mapperOverlapRatio = grantTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grantTotal;
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        long sourceRecordCount = cubingJob.findSourceRecordCount();
        CubeStatsWriter.writeCuboidStatistics(hadoopConf, statisticsDir, cuboidHLLMap, samplingPercentage,
                mapperNumber, mapperOverlapRatio, sourceRecordCount);

        Path statisticsFile = new Path(statisticsDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);
        logger.info("{} stats saved to hdfs {}", newSegment, statisticsFile);

        FSDataInputStream is = fs.open(statisticsFile);
        try {
            // put the statistics to metadata store
            String resPath = newSegment.getStatisticsResourcePath();
            rs.putResource(resPath, is, System.currentTimeMillis());
            logger.info("{} stats saved to resource {}", newSegment, resPath);

            StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, newSegment);
            StatisticsDecisionUtil.optimizeCubingPlan(newSegment);
        } finally {
            IOUtils.closeStream(is);
        }

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to save cuboid statistics", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 17
Source File: MergeStatisticsWithOldStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager mgr = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));

    CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment);
    Preconditions.checkNotNull(oldSegment,
            "cannot find the original segment to be optimized by " + optimizeSegment);

    KylinConfig kylinConf = cube.getConfig();
    Configuration conf = HadoopUtil.getCurrentConfiguration();
    ResourceStore rs = ResourceStore.getStore(kylinConf);
    int averageSamplingPercentage = 0;

    try {
        //1. Add statistics from optimized segment
        Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        FileSystem hdfs = FileSystem.get(conf);
        if (!hdfs.exists(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists");
        }

        if (!hdfs.isDirectory(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory");
        }

        Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath);
        }

        for (Path item : statisticsFiles) {
            CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null,
                    optimizeSegment.getConfig(), item);
            averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage();
            addFromCubeStatsReader(optimizeSegmentStatsReader);
        }

        //2. Add statistics from old segment
        CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig());
        averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage();
        addFromCubeStatsReader(oldSegmentStatsReader);

        logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString());
        //3. Store merged statistics for recommend cuboids
        averageSamplingPercentage = averageSamplingPercentage / 2;
        Set<Long> cuboidsRecommend = cube.getCuboidsRecommend();

        Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size());
        for (Long cuboid : cuboidsRecommend) {
            HLLCounter hll = cuboidHLLMap.get(cuboid);
            if (hll == null) {
                logger.warn("Cannot get the row count stats for cuboid " + cuboid);
            } else {
                resultCuboidHLLMap.put(cuboid, hll);
            }
        }

        String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams());
        CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap,
                averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount());

        try (FSDataInputStream mergedStats = hdfs
                .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) {
            // put the statistics to metadata store
            String statisticsFileName = optimizeSegment.getStatisticsResourcePath();
            rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis());
        }

        //By default, the cube optimization will use in-memory cubing
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment);

        return new ExecuteResult();
    } catch (IOException e) {
        logger.error("fail to merge cuboid statistics", e);
        return ExecuteResult.createError(e);
    }

}
 
Example 18
Source File: MergeStatisticsWithOldStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager mgr = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));

    CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment);
    Preconditions.checkNotNull(oldSegment,
            "cannot find the original segment to be optimized by " + optimizeSegment);

    KylinConfig kylinConf = cube.getConfig();
    Configuration conf = HadoopUtil.getCurrentConfiguration();
    ResourceStore rs = ResourceStore.getStore(kylinConf);
    int averageSamplingPercentage = 0;

    try {
        //1. Add statistics from optimized segment
        Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        FileSystem hdfs = FileSystem.get(conf);
        if (!hdfs.exists(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists");
        }

        if (!hdfs.isDirectory(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory");
        }

        Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath);
        }

        for (Path item : statisticsFiles) {
            CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null,
                    optimizeSegment.getConfig(), item);
            averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage();
            addFromCubeStatsReader(optimizeSegmentStatsReader);
        }

        //2. Add statistics from old segment
        CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig());
        averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage();
        addFromCubeStatsReader(oldSegmentStatsReader);

        logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString());
        //3. Store merged statistics for recommend cuboids
        averageSamplingPercentage = averageSamplingPercentage / 2;
        Set<Long> cuboidsRecommend = cube.getCuboidsRecommend();

        Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size());
        for (Long cuboid : cuboidsRecommend) {
            HLLCounter hll = cuboidHLLMap.get(cuboid);
            if (hll == null) {
                logger.warn("Cannot get the row count stats for cuboid " + cuboid);
            } else {
                resultCuboidHLLMap.put(cuboid, hll);
            }
        }

        String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams());
        CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap,
                averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount());

        try (FSDataInputStream mergedStats = hdfs
                .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) {
            // put the statistics to metadata store
            String statisticsFileName = optimizeSegment.getStatisticsResourcePath();
            rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis());
        }

        //By default, the cube optimization will use in-memory cubing
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment);

        return new ExecuteResult();
    } catch (IOException e) {
        logger.error("fail to merge cuboid statistics", e);
        return ExecuteResult.createError(e);
    }

}
 
Example 19
Source File: CubeStatsReader.java    From kylin-on-parquet-v2 with Apache License 2.0 4 votes vote down vote up
/**
 * Estimate the cuboid's size
 *
 * @return the cuboid size in M bytes
 */
private static double estimateCuboidStorageSize(CubeSegment cubeSegment, long cuboidId, long rowCount,
        long baseCuboidId, long baseCuboidCount, List<Integer> rowKeyColumnLength, long sourceRowCount) {

    int rowkeyLength = cubeSegment.getRowKeyPreambleSize();
    KylinConfig kylinConf = cubeSegment.getConfig();

    long mask = Long.highestOneBit(baseCuboidId);
    long parentCuboidIdActualLength = (long) Long.SIZE - Long.numberOfLeadingZeros(baseCuboidId);
    for (int i = 0; i < parentCuboidIdActualLength; i++) {
        if ((mask & cuboidId) > 0) {
            rowkeyLength += rowKeyColumnLength.get(i); //colIO.getColumnLength(columnList.get(i));
        }
        mask = mask >> 1;
    }

    // add the measure length
    int normalSpace = rowkeyLength;
    int countDistinctSpace = 0;
    double percentileSpace = 0;
    int topNSpace = 0;
    for (MeasureDesc measureDesc : cubeSegment.getCubeDesc().getMeasures()) {
        if (rowCount == 0)
            break;
        DataType returnType = measureDesc.getFunction().getReturnDataType();
        if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_COUNT_DISTINCT)) {
            long estimateDistinctCount = sourceRowCount / rowCount;
            estimateDistinctCount = estimateDistinctCount == 0 ? 1L : estimateDistinctCount;
            countDistinctSpace += returnType.getStorageBytesEstimate(estimateDistinctCount);
        } else if (measureDesc.getFunction().getExpression().equals(FunctionDesc.FUNC_PERCENTILE)) {
            percentileSpace += returnType.getStorageBytesEstimate(baseCuboidCount * 1.0 / rowCount);
        } else if (measureDesc.getFunction().getExpression().equals(TopNMeasureType.FUNC_TOP_N)) {
            long estimateTopNCount = sourceRowCount / rowCount;
            estimateTopNCount = estimateTopNCount == 0 ? 1L : estimateTopNCount;
            topNSpace += returnType.getStorageBytesEstimate(estimateTopNCount);
        } else {
            normalSpace += returnType.getStorageBytesEstimate();
        }
    }

    double cuboidSizeRatio = kylinConf.getJobCuboidSizeRatio();
    double cuboidSizeMemHungryRatio = kylinConf.getJobCuboidSizeCountDistinctRatio();
    double cuboidSizeTopNRatio = kylinConf.getJobCuboidSizeTopNRatio();

    double ret = (1.0 * normalSpace * rowCount * cuboidSizeRatio
            + 1.0 * countDistinctSpace * rowCount * cuboidSizeMemHungryRatio + 1.0 * percentileSpace * rowCount
            + 1.0 * topNSpace * rowCount * cuboidSizeTopNRatio) / (1024L * 1024L);
    return ret;
}
 
Example 20
Source File: SaveStatisticsStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    CubeSegment newSegment = CubingExecutableUtil.findSegment(context,
            CubingExecutableUtil.getCubeName(this.getParams()),
            CubingExecutableUtil.getSegmentId(this.getParams()));
    KylinConfig kylinConf = newSegment.getConfig();

    ResourceStore rs = ResourceStore.getStore(kylinConf);
    try {

        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
        Path statisticsDir = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        Path[] statisticsFiles = HadoopUtil.getFilteredPath(fs, statisticsDir,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDir);
        }

        Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
        long totalRowsBeforeMerge = 0;
        long grantTotal = 0;
        int samplingPercentage = -1;
        int mapperNumber = -1;
        for (Path item : statisticsFiles) {
            CubeStatsReader.CubeStatsResult cubeStatsResult = new CubeStatsReader.CubeStatsResult(item,
                    kylinConf.getCubeStatsHLLPrecision());
            cuboidHLLMap.putAll(cubeStatsResult.getCounterMap());
            long pGrantTotal = 0L;
            for (HLLCounter hll : cubeStatsResult.getCounterMap().values()) {
                pGrantTotal += hll.getCountEstimate();
            }
            totalRowsBeforeMerge += pGrantTotal * cubeStatsResult.getMapperOverlapRatio();
            grantTotal += pGrantTotal;
            int pMapperNumber = cubeStatsResult.getMapperNumber();
            if (pMapperNumber > 0) {
                if (mapperNumber < 0) {
                    mapperNumber = pMapperNumber;
                } else {
                    throw new RuntimeException(
                            "Base cuboid has been distributed to multiple reducers at step FactDistinctColumnsReducer!!!");
                }
            }
            int pSamplingPercentage = cubeStatsResult.getPercentage();
            if (samplingPercentage < 0) {
                samplingPercentage = pSamplingPercentage;
            } else if (samplingPercentage != pSamplingPercentage) {
                throw new RuntimeException(
                        "The sampling percentage should be same among all of the reducer of FactDistinctColumnsReducer!!!");
            }
        }
        if (samplingPercentage < 0) {
            logger.warn("The sampling percentage should be set!!!");
        }
        if (mapperNumber < 0) {
            logger.warn("The mapper number should be set!!!");
        }

        if (logger.isDebugEnabled()) {
            logMapperAndCuboidStatistics(cuboidHLLMap, samplingPercentage, mapperNumber, grantTotal,
                    totalRowsBeforeMerge);
        }
        double mapperOverlapRatio = grantTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grantTotal;
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        long sourceRecordCount = cubingJob.findSourceRecordCount();
        CubeStatsWriter.writeCuboidStatistics(hadoopConf, statisticsDir, cuboidHLLMap, samplingPercentage,
                mapperNumber, mapperOverlapRatio, sourceRecordCount);

        Path statisticsFile = new Path(statisticsDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);
        logger.info("{} stats saved to hdfs {}", newSegment, statisticsFile);

        FSDataInputStream is = fs.open(statisticsFile);
        try {
            // put the statistics to metadata store
            String resPath = newSegment.getStatisticsResourcePath();
            rs.putResource(resPath, is, System.currentTimeMillis());
            logger.info("{} stats saved to resource {}", newSegment, resPath);

            StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, newSegment);
            StatisticsDecisionUtil.optimizeCubingPlan(newSegment);
        } finally {
            IOUtils.closeStream(is);
        }

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to save cuboid statistics", e);
        return ExecuteResult.createError(e);
    }
}