Java Code Examples for org.apache.kylin.cube.CubeInstance#getConfig()

The following examples show how to use org.apache.kylin.cube.CubeInstance#getConfig(). Each example is taken from an open source project; its source file, originating project, and license are noted above the code.
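Before the individual examples, here is a minimal sketch of the pattern most of them share: resolve a CubeInstance through CubeManager and read its cube-level KylinConfig with getConfig(). This is an illustrative sketch, not taken from any of the projects below; it assumes a configured Kylin environment, and the cube name passed in is a placeholder.

import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.cube.CubeInstance;
import org.apache.kylin.cube.CubeManager;

public class CubeConfigLookup {

    // Returns the cube-specific KylinConfig for the named cube.
    // Assumes the environment already points at a valid Kylin metadata store,
    // so KylinConfig.getInstanceFromEnv() can resolve the base configuration.
    public static KylinConfig cubeConfigOf(String cubeName) {
        KylinConfig envConfig = KylinConfig.getInstanceFromEnv();
        CubeInstance cube = CubeManager.getInstance(envConfig).getCube(cubeName);
        return cube.getConfig(); // cube-level view of the config, per-cube overrides included
    }
}

The examples below follow this same shape and differ mainly in where the base KylinConfig comes from: the execution context, metadata loaded from HDFS, or the environment.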
Example 1
Source File: CalculateStatsFromBaseCuboidReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());

    Configuration conf = context.getConfiguration();
    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeConfig = cube.getConfig();

    baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
    baseCuboidRowCountInMappers = Lists.newLinkedList();

    output = conf.get(BatchConstants.CFG_OUTPUT_PATH);
    samplingPercentage = Integer
            .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));

    taskId = context.getTaskAttemptID().getTaskID().getId();
    cuboidHLLMap = Maps.newHashMap();
}
 
Example 2
Source File: HiveInputBase.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public void addStepPhase1_CreateFlatTable(DefaultChainedExecutable jobFlow) {
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    final KylinConfig cubeConfig = cubeInstance.getConfig();

    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

    // create flat table first
    addStepPhase1_DoCreateFlatTable(jobFlow);

    // create global dict
    KylinConfig dictConfig = (flatDesc.getSegment()).getConfig();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();
    if (mrHiveDictColumns.length > 0) {
        String globalDictDatabase = dictConfig.getMrHiveDictDB();
        if (null == globalDictDatabase) {
            throw new IllegalArgumentException("Mr-Hive Global dict database is null.");
        }
        String globalDictTable = cubeName + dictConfig.getMrHiveDictTableSuffix();
        addStepPhase1_DoCreateMrHiveGlobalDict(jobFlow, mrHiveDictColumns, globalDictDatabase, globalDictTable);
    }

    // then count and redistribute
    if (cubeConfig.isHiveRedistributeEnabled()) {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        //jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor()));
        if (kylinConfig.isLivyEnabled() && cubeInstance.getEngineType() == IEngineAware.ID_SPARK) {
            jobFlow.addTask(createRedistributeFlatHiveTableByLivyStep(hiveInitStatements, cubeName, flatDesc,
                    cubeInstance.getDescriptor()));
        } else {
            jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc,
                    cubeInstance.getDescriptor()));
        }
    }

    // special for hive
    addStepPhase1_DoMaterializeLookupTable(jobFlow);
}
 
Example 3
Source File: MergeDictionaryStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager mgr = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment newSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));
    final List<CubeSegment> mergingSegments = getMergingSegments(cube);
    KylinConfig conf = cube.getConfig();

    Collections.sort(mergingSegments);

    try {
        checkLookupSnapshotsMustIncremental(mergingSegments);

        // work on copy instead of cached objects
        CubeInstance cubeCopy = cube.latestCopyForWrite();
        CubeSegment newSegCopy = cubeCopy.getSegmentById(newSegment.getUuid());
        
        makeDictForNewSegment(conf, cubeCopy, newSegCopy, mergingSegments);
        makeSnapshotForNewSegment(cubeCopy, newSegCopy, mergingSegments);

        CubeUpdate update = new CubeUpdate(cubeCopy);
        update.setToUpdateSegs(newSegCopy);
        mgr.updateCube(update);
        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to merge dictionary or lookup snapshots", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 4
Source File: JobStepFactory.java    From kylin-on-parquet-v2 with Apache License 2.0
public static NSparkExecutable addStep(DefaultChainedExecutable parent, JobStepType type,
        CubeInstance cube) {
    NSparkExecutable step;
    KylinConfig config = cube.getConfig();
    switch (type) {
    case RESOURCE_DETECT:
        step = new NResourceDetectStep(parent);
        break;
    case CUBING:
        step = new NSparkCubingStep(config.getSparkBuildClassName());
        break;
    case MERGING:
        step = new NSparkMergingStep(config.getSparkMergeClassName());
        break;
    case CLEAN_UP_AFTER_MERGE:
        step = new NSparkUpdateMetaAndCleanupAfterMergeStep();
        break;
    default:
        throw new IllegalArgumentException();
    }

    step.setParams(parent.getParams());
    step.setProject(parent.getProject());
    step.setTargetSubject(parent.getTargetSubject());
    if (step instanceof NSparkUpdateMetaAndCleanupAfterMergeStep) {
        CubeSegment mergeSegment = cube.getSegmentById(parent.getTargetSegments().iterator().next());
        final Segments<CubeSegment> mergingSegments = cube.getMergingSegments(mergeSegment);
        step.setParam(MetadataConstants.P_SEGMENT_NAMES,
                String.join(",", NSparkCubingUtil.toSegmentNames(mergingSegments)));
        step.setParam(CubingExecutableUtil.SEGMENT_ID, parent.getParam(CubingExecutableUtil.SEGMENT_ID));
        step.setParam(MetadataConstants.P_JOB_TYPE, parent.getParam(MetadataConstants.P_JOB_TYPE));
        step.setParam(MetadataConstants.P_OUTPUT_META_URL, parent.getParam(MetadataConstants.P_OUTPUT_META_URL));
    }
    parent.addTask(step);
    //after addTask, step's id is changed
    step.setDistMetaUrl(config.getJobTmpMetaStoreUrl(parent.getProject(), step.getId()));
    return step;
}
 
Example 5
Source File: LookupSnapshotJobBuilder.java    From kylin-on-parquet-v2 with Apache License 2.0
public LookupSnapshotJobBuilder(CubeInstance cube, String lookupTable, List<String> segments, String submitter) {
    this.cube = cube;
    this.lookupTable = lookupTable;
    this.segments = segments;
    this.submitter = submitter;
    this.kylinConfig = cube.getConfig();
}
 
Example 6
Source File: CuboidRecommenderUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
/** Trigger cube planner phase two for optimization */
public static Map<Long, Long> getRecommendCuboidList(CubeInstance cube, Map<Long, Long> hitFrequencyMap,
        Map<Long, Map<Long, Pair<Long, Long>>> rollingUpCountSourceMap) throws IOException {

    CuboidScheduler cuboidScheduler = cube.getCuboidScheduler();
    Set<Long> currentCuboids = cuboidScheduler.getAllCuboidIds();
    Pair<Map<Long, Long>, Map<Long, Double>> statsPair = CuboidStatsReaderUtil
            .readCuboidStatsAndSizeFromCube(currentCuboids, cube);
    long baseCuboid = cuboidScheduler.getBaseCuboidId();
    if (statsPair.getFirst().get(baseCuboid) == null || statsPair.getFirst().get(baseCuboid) == 0L) {
        logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO);
        return null;
    }

    KylinConfig config = cube.getConfig();
    String key = cube.getName();
    double queryUncertaintyRatio = config.getCubePlannerQueryUncertaintyRatio();
    double bpusMinBenefitRatio = config.getCubePlannerBPUSMinBenefitRatio();
    CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, statsPair.getFirst(),
            statsPair.getSecond()) {
        @Override
        public Map<Long, Double> estimateCuboidsSize(Map<Long, Long> statistics) {
            try {
                return CuboidStatsReaderUtil.readCuboidSizeFromCube(statistics, cube);
            } catch (IOException e) {
                logger.warn("Fail to get cuboid size from cube due to ", e);
                return null;
            }
        }
    }.setQueryUncertaintyRatio(queryUncertaintyRatio) //
            .setBPUSMinBenefitRatio(bpusMinBenefitRatio) //
            .setHitFrequencyMap(hitFrequencyMap) //
            .setRollingUpCountSourceMap(rollingUpCountSourceMap) //
            .build();
    return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, config);
}
 
Example 7
Source File: CuboidRecommenderUtil.java    From kylin with Apache License 2.0
/** Trigger cube planner phase two for optimization */
public static Map<Long, Long> getRecommendCuboidList(CubeInstance cube, Map<Long, Long> hitFrequencyMap,
        Map<Long, Map<Long, Pair<Long, Long>>> rollingUpCountSourceMap) throws IOException {

    CuboidScheduler cuboidScheduler = cube.getCuboidScheduler();
    Set<Long> currentCuboids = cuboidScheduler.getAllCuboidIds();
    Pair<Map<Long, Long>, Map<Long, Double>> statsPair = CuboidStatsReaderUtil
            .readCuboidStatsAndSizeFromCube(currentCuboids, cube);
    long baseCuboid = cuboidScheduler.getBaseCuboidId();
    if (statsPair.getFirst().get(baseCuboid) == null || statsPair.getFirst().get(baseCuboid) == 0L) {
        logger.info(BASE_CUBOID_COUNT_IN_CUBOID_STATISTICS_IS_ZERO);
        return null;
    }

    KylinConfig config = cube.getConfig();
    String key = cube.getName();
    double queryUncertaintyRatio = config.getCubePlannerQueryUncertaintyRatio();
    double bpusMinBenefitRatio = config.getCubePlannerBPUSMinBenefitRatio();
    CuboidStats cuboidStats = new CuboidStats.Builder(key, baseCuboid, statsPair.getFirst(),
            statsPair.getSecond()) {
        @Override
        public Map<Long, Double> estimateCuboidsSize(Map<Long, Long> statistics) {
            try {
                return CuboidStatsReaderUtil.readCuboidSizeFromCube(statistics, cube);
            } catch (IOException e) {
                logger.warn("Fail to get cuboid size from cube due to ", e);
                return null;
            }
        }
    }.setQueryUncertaintyRatio(queryUncertaintyRatio) //
            .setBPUSMinBenefitRatio(bpusMinBenefitRatio) //
            .setHitFrequencyMap(hitFrequencyMap) //
            .setRollingUpCountSourceMap(rollingUpCountSourceMap) //
            .build();
    return CuboidRecommender.getInstance().getRecommendCuboidList(cuboidStats, config);
}
 
Example 8
Source File: FactDistinctColumnsReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeConfig = cube.getConfig();
    cubeDesc = cube.getDescriptor();

    taskId = context.getTaskAttemptID().getTaskID().getId();

    reducerMapping = new FactDistinctColumnsReducerMapping(cube);

    logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
        // hll
        isStatistics = true;
        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        baseCuboidRowCountInMappers = Lists.newArrayList();
        cuboidHLLMap = Maps.newHashMap();
        samplingPercentage = Integer
                .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
        logger.info("Reducer " + taskId + " handling stats");
    } else {
        // normal col
        col = reducerMapping.getColForReducer(taskId);
        Preconditions.checkNotNull(col);

        // local build dict
        buildDictInReducer = config.isBuildDictInReducerEnabled();
        if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
            buildDictInReducer = false;
        }
        if (reducerMapping.getReducerNumForDimCol(col) > 1) {
            buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
        }
        if (buildDictInReducer) {
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        }
        logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
    }
}
 
Example 9
Source File: HBaseLookupMRSteps.java    From kylin with Apache License 2.0
public HBaseLookupMRSteps(CubeInstance cube) {
    this.cube = cube;
    this.config = new JobEngineConfig(cube.getConfig());
}
 
Example 10
Source File: MergeStatisticsWithOldStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager mgr = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));

    CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment);
    Preconditions.checkNotNull(oldSegment,
            "cannot find the original segment to be optimized by " + optimizeSegment);

    KylinConfig kylinConf = cube.getConfig();
    Configuration conf = HadoopUtil.getCurrentConfiguration();
    ResourceStore rs = ResourceStore.getStore(kylinConf);
    int averageSamplingPercentage = 0;

    try {
        //1. Add statistics from optimized segment
        Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        FileSystem hdfs = FileSystem.get(conf);
        if (!hdfs.exists(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists");
        }

        if (!hdfs.isDirectory(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory");
        }

        Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath);
        }

        for (Path item : statisticsFiles) {
            CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null,
                    optimizeSegment.getConfig(), item);
            averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage();
            addFromCubeStatsReader(optimizeSegmentStatsReader);
        }

        //2. Add statistics from old segment
        CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig());
        averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage();
        addFromCubeStatsReader(oldSegmentStatsReader);

        logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString());
        //3. Store merged statistics for recommend cuboids
        averageSamplingPercentage = averageSamplingPercentage / 2;
        Set<Long> cuboidsRecommend = cube.getCuboidsRecommend();

        Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size());
        for (Long cuboid : cuboidsRecommend) {
            HLLCounter hll = cuboidHLLMap.get(cuboid);
            if (hll == null) {
                logger.warn("Cannot get the row count stats for cuboid " + cuboid);
            } else {
                resultCuboidHLLMap.put(cuboid, hll);
            }
        }

        String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams());
        CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap,
                averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount());

        try (FSDataInputStream mergedStats = hdfs
                .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) {
            // put the statistics to metadata store
            String statisticsFileName = optimizeSegment.getStatisticsResourcePath();
            rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis());
        }

        //By default, the cube optimization will use in-memory cubing
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment);

        return new ExecuteResult();
    } catch (IOException e) {
        logger.error("fail to merge cuboid statistics", e);
        return ExecuteResult.createError(e);
    }

}
 
Example 11
Source File: SparkExecutableLivy.java    From kylin with Apache License 2.0
@SuppressWarnings("checkstyle:methodlength")
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    ExecutableManager mgr = getManager();
    Map<String, String> extra = mgr.getOutput(getId()).getExtra();
    String sparkJobId = extra.get(ExecutableConstants.SPARK_JOB_ID);
    if (!StringUtils.isEmpty(sparkJobId)) {
        return onResumed(sparkJobId, mgr);
    } else {
        String cubeName = this.getParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt());
        CubeInstance cube = CubeManager.getInstance(context.getConfig()).getCube(cubeName);
        final KylinConfig config = cube.getConfig();

        setAlgorithmLayer();

        LivyRestBuilder livyRestBuilder = new LivyRestBuilder();

        String segmentID = this.getParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt());
        CubeSegment segment = cube.getSegmentById(segmentID);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);
        dumpMetadata(segment, mergingSeg);

        Map<String, String> sparkConfs = config.getSparkConfigOverride();
        String sparkConfigName = getSparkConfigName();
        if (sparkConfigName != null) {
            Map<String, String> sparkSpecificConfs = config.getSparkConfigOverrideWithSpecificName(sparkConfigName);
            sparkConfs.putAll(sparkSpecificConfs);
        }

        for (Map.Entry<String, String> entry : sparkConfs.entrySet()) {
            if (entry.getKey().equals("spark.submit.deployMode") || entry.getKey().equals("spark.master")
                    || entry.getKey().equals("spark.yarn.archive")) {
                continue;
            } else {
                livyRestBuilder.addConf(entry.getKey(), entry.getValue());
            }
        }
        formatArgs(livyRestBuilder.getArgs());

        final LivyRestExecutor executor = new LivyRestExecutor();
        final PatternedLogger patternedLogger = new PatternedLogger(logger, (infoKey, info) -> {
            // only care three properties here
            if (ExecutableConstants.SPARK_JOB_ID.equals(infoKey) || ExecutableConstants.YARN_APP_ID.equals(infoKey)
                    || ExecutableConstants.YARN_APP_URL.equals(infoKey)) {
                getManager().addJobInfo(getId(), info);
            }
        });

        try {
            livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.job);
            executor.execute(livyRestBuilder, patternedLogger);
            if (isDiscarded()) {
                return new ExecuteResult(ExecuteResult.State.DISCARDED, "Discarded");
            }
            if (isPaused()) {
                return new ExecuteResult(ExecuteResult.State.STOPPED, "Stopped");
            }
            // done, update all properties
            Map<String, String> joblogInfo = patternedLogger.getInfo();
            // read counter from hdfs
            String counterOutput = getParam(BatchConstants.ARG_COUNTER_OUTPUT);
            if (counterOutput != null) {
                if (HadoopUtil.getWorkingFileSystem().exists(new Path(counterOutput))) {
                    Map<String, String> counterMap = HadoopUtil.readFromSequenceFile(counterOutput);
                    joblogInfo.putAll(counterMap);
                } else {
                    logger.warn("Spark counter output path not exists: " + counterOutput);
                }
            }
            readCounters(joblogInfo);
            getManager().addJobInfo(getId(), joblogInfo);
            return new ExecuteResult(ExecuteResult.State.SUCCEED, patternedLogger.getBufferedLog());

        } catch (Exception e) {
            logger.error("error run spark job:", e);
            // clear SPARK_JOB_ID on job failure.
            extra = mgr.getOutput(getId()).getExtra();
            extra.put(ExecutableConstants.SPARK_JOB_ID, "");
            getManager().addJobInfo(getId(), extra);
            return new ExecuteResult(ExecuteResult.State.ERROR, e.getMessage());
        }
    }
}
 
Example 12
Source File: HBaseLookupMRSteps.java    From kylin-on-parquet-v2 with Apache License 2.0
public HBaseLookupMRSteps(CubeInstance cube) {
    this.cube = cube;
    this.config = new JobEngineConfig(cube.getConfig());
}
 
Example 13
Source File: Cuboid.java    From kylin-on-parquet-v2 with Apache License 2.0
public static void clearCache(CubeInstance cubeInstance) {
    KylinConfig config = cubeInstance.getConfig();
    CuboidManager.getInstance(config).clearCache(cubeInstance);
}
 
Example 14
Source File: SparkFactDistinct.java    From kylin-on-parquet-v2 with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // hll
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}
 
Example 15
Source File: SparkExecutableLivy.java    From kylin-on-parquet-v2 with Apache License 2.0
@SuppressWarnings("checkstyle:methodlength")
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    ExecutableManager mgr = getManager();
    Map<String, String> extra = mgr.getOutput(getId()).getExtra();
    String sparkJobId = extra.get(ExecutableConstants.SPARK_JOB_ID);
    if (!StringUtils.isEmpty(sparkJobId)) {
        return onResumed(sparkJobId, mgr);
    } else {
        String cubeName = this.getParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt());
        CubeInstance cube = CubeManager.getInstance(context.getConfig()).getCube(cubeName);
        final KylinConfig config = cube.getConfig();

        setAlgorithmLayer();

        LivyRestBuilder livyRestBuilder = new LivyRestBuilder();

        String segmentID = this.getParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt());
        CubeSegment segment = cube.getSegmentById(segmentID);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);
        dumpMetadata(segment, mergingSeg);

        Map<String, String> sparkConfs = config.getSparkConfigOverride();
        String sparkConfigName = getSparkConfigName();
        if (sparkConfigName != null) {
            Map<String, String> sparkSpecificConfs = config.getSparkConfigOverrideWithSpecificName(sparkConfigName);
            sparkConfs.putAll(sparkSpecificConfs);
        }

        for (Map.Entry<String, String> entry : sparkConfs.entrySet()) {
            if (entry.getKey().equals("spark.submit.deployMode") || entry.getKey().equals("spark.master")
                    || entry.getKey().equals("spark.yarn.archive")) {
                continue;
            } else {
                livyRestBuilder.addConf(entry.getKey(), entry.getValue());
            }
        }
        formatArgs(livyRestBuilder.getArgs());

        final LivyRestExecutor executor = new LivyRestExecutor();
        final PatternedLogger patternedLogger = new PatternedLogger(logger, (infoKey, info) -> {
            // only care three properties here
            if (ExecutableConstants.SPARK_JOB_ID.equals(infoKey) || ExecutableConstants.YARN_APP_ID.equals(infoKey)
                    || ExecutableConstants.YARN_APP_URL.equals(infoKey)) {
                getManager().addJobInfo(getId(), info);
            }
        });

        try {
            livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.job);
            executor.execute(livyRestBuilder, patternedLogger);
            if (isDiscarded()) {
                return new ExecuteResult(ExecuteResult.State.DISCARDED, "Discarded");
            }
            if (isPaused()) {
                return new ExecuteResult(ExecuteResult.State.STOPPED, "Stopped");
            }
            // done, update all properties
            Map<String, String> joblogInfo = patternedLogger.getInfo();
            // read counter from hdfs
            String counterOutput = getParam(BatchConstants.ARG_COUNTER_OUTPUT);
            if (counterOutput != null) {
                if (HadoopUtil.getWorkingFileSystem().exists(new Path(counterOutput))) {
                    Map<String, String> counterMap = HadoopUtil.readFromSequenceFile(counterOutput);
                    joblogInfo.putAll(counterMap);
                } else {
                    logger.warn("Spark counter output path not exists: " + counterOutput);
                }
            }
            readCounters(joblogInfo);
            getManager().addJobInfo(getId(), joblogInfo);
            return new ExecuteResult(ExecuteResult.State.SUCCEED, patternedLogger.getBufferedLog());

        } catch (Exception e) {
            logger.error("error run spark job:", e);
            // clear SPARK_JOB_ID on job failure.
            extra = mgr.getOutput(getId()).getExtra();
            extra.put(ExecutableConstants.SPARK_JOB_ID, "");
            getManager().addJobInfo(getId(), extra);
            return new ExecuteResult(ExecuteResult.State.ERROR, e.getMessage());
        }
    }
}
 
Example 16
Source File: SparkFactDistinct.java    From kylin with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // hll
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}
 
Example 17
Source File: AbstractExecutable.java    From kylin with Apache License 2.0
public KylinConfig getCubeSpecificConfig() {
    String cubeName = getCubeName();
    CubeManager manager = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
    CubeInstance cube = manager.getCube(cubeName);
    return cube.getConfig();
}
 
Example 18
Source File: UpdateDictionaryStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager cubeMgr = CubeManager.getInstance(context.getConfig());
    final DictionaryManager dictMgrHdfs;
    final DictionaryManager dictMgrHbase;
    final CubeInstance cube = cubeMgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment newSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));
    final List<CubeSegment> mergingSegments = getMergingSegments(cube);
    final String dictInfoPath = this.getParams().get(BatchConstants.ARG_DICT_PATH);
    final String metadataUrl = this.getParams().get(BatchConstants.ARG_META_URL);

    final KylinConfig kylinConfHbase = cube.getConfig();
    final KylinConfig kylinConfHdfs = AbstractHadoopJob.loadKylinConfigFromHdfs(metadataUrl);

    Collections.sort(mergingSegments);

    try {
        Configuration conf = HadoopUtil.getCurrentConfiguration();
        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        ResourceStore hbaseRS = ResourceStore.getStore(kylinConfHbase);
        ResourceStore hdfsRS = ResourceStore.getStore(kylinConfHdfs);
        dictMgrHdfs = DictionaryManager.getInstance(kylinConfHdfs);
        dictMgrHbase = DictionaryManager.getInstance(kylinConfHbase);

        // work on copy instead of cached objects
        CubeInstance cubeCopy = cube.latestCopyForWrite();
        CubeSegment newSegCopy = cubeCopy.getSegmentById(newSegment.getUuid());

        // update cube segment dictionary

        FileStatus[] fileStatuss = fs.listStatus(new Path(dictInfoPath), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part") || path.getName().startsWith("tmp");
            }
        });

        for (FileStatus fileStatus : fileStatuss) {
            Path filePath = fileStatus.getPath();

            SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, conf);
            Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Text value = (Text) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            while (reader.next(key, value)) {
                String tblCol = key.toString();
                String dictInfoResource = value.toString();

                if (StringUtils.isNotEmpty(dictInfoResource)) {
                    logger.info(dictInfoResource);
                    // put dictionary file to metadata store
                    DictionaryInfo dictInfoHdfs = dictMgrHdfs.getDictionaryInfo(dictInfoResource);
                    DictionaryInfo dicInfoHbase = dictMgrHbase.trySaveNewDict(dictInfoHdfs.getDictionaryObject(), dictInfoHdfs);

                    if (dicInfoHbase != null){
                        TblColRef tblColRef = cube.getDescriptor().findColumnRef(tblCol.split(":")[0], tblCol.split(":")[1]);
                        newSegCopy.putDictResPath(tblColRef, dicInfoHbase.getResourcePath());
                    }
                }
            }

            IOUtils.closeStream(reader);
        }

        CubeSegment lastSeg = mergingSegments.get(mergingSegments.size() - 1);
        for (Map.Entry<String, String> entry : lastSeg.getSnapshots().entrySet()) {
            newSegCopy.putSnapshotResPath(entry.getKey(), entry.getValue());
        }

        // update statistics
        // put the statistics to metadata store
        String statisticsFileName = newSegment.getStatisticsResourcePath();
        hbaseRS.putResource(statisticsFileName, hdfsRS.getResource(newSegment.getStatisticsResourcePath()).content(), System.currentTimeMillis());

        CubeUpdate update = new CubeUpdate(cubeCopy);
        update.setToUpdateSegs(newSegCopy);
        cubeMgr.updateCube(update);

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to merge dictionary", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 19
Source File: MergeStatisticsWithOldStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager mgr = CubeManager.getInstance(context.getConfig());
    final CubeInstance cube = mgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment optimizeSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));

    CubeSegment oldSegment = optimizeSegment.getCubeInstance().getOriginalSegmentToOptimize(optimizeSegment);
    Preconditions.checkNotNull(oldSegment,
            "cannot find the original segment to be optimized by " + optimizeSegment);

    KylinConfig kylinConf = cube.getConfig();
    Configuration conf = HadoopUtil.getCurrentConfiguration();
    ResourceStore rs = ResourceStore.getStore(kylinConf);
    int averageSamplingPercentage = 0;

    try {
        //1. Add statistics from optimized segment
        Path statisticsDirPath = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        FileSystem hdfs = FileSystem.get(conf);
        if (!hdfs.exists(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " does not exists");
        }

        if (!hdfs.isDirectory(statisticsDirPath)) {
            throw new IOException("StatisticsFilePath " + statisticsDirPath + " is not a directory");
        }

        Path[] statisticsFiles = HadoopUtil.getFilteredPath(hdfs, statisticsDirPath,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDirPath);
        }

        for (Path item : statisticsFiles) {
            CubeStatsReader optimizeSegmentStatsReader = new CubeStatsReader(optimizeSegment, null,
                    optimizeSegment.getConfig(), item);
            averageSamplingPercentage += optimizeSegmentStatsReader.getSamplingPercentage();
            addFromCubeStatsReader(optimizeSegmentStatsReader);
        }

        //2. Add statistics from old segment
        CubeStatsReader oldSegmentStatsReader = new CubeStatsReader(oldSegment, null, oldSegment.getConfig());
        averageSamplingPercentage += oldSegmentStatsReader.getSamplingPercentage();
        addFromCubeStatsReader(oldSegmentStatsReader);

        logger.info("Cuboid set with stats info: " + cuboidHLLMap.keySet().toString());
        //3. Store merged statistics for recommend cuboids
        averageSamplingPercentage = averageSamplingPercentage / 2;
        Set<Long> cuboidsRecommend = cube.getCuboidsRecommend();

        Map<Long, HLLCounter> resultCuboidHLLMap = Maps.newHashMapWithExpectedSize(cuboidsRecommend.size());
        for (Long cuboid : cuboidsRecommend) {
            HLLCounter hll = cuboidHLLMap.get(cuboid);
            if (hll == null) {
                logger.warn("Cannot get the row count stats for cuboid " + cuboid);
            } else {
                resultCuboidHLLMap.put(cuboid, hll);
            }
        }

        String resultDir = CubingExecutableUtil.getMergedStatisticsPath(this.getParams());
        CubeStatsWriter.writeCuboidStatistics(conf, new Path(resultDir), resultCuboidHLLMap,
                averageSamplingPercentage, oldSegmentStatsReader.getSourceRowCount());

        try (FSDataInputStream mergedStats = hdfs
                .open(new Path(resultDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME))) {
            // put the statistics to metadata store
            String statisticsFileName = optimizeSegment.getStatisticsResourcePath();
            rs.putResource(statisticsFileName, mergedStats, System.currentTimeMillis());
        }

        //By default, the cube optimization will use in-memory cubing
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, optimizeSegment);

        return new ExecuteResult();
    } catch (IOException e) {
        logger.error("fail to merge cuboid statistics", e);
        return ExecuteResult.createError(e);
    }

}
 
Example 20
Source File: FactDistinctColumnsReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeConfig = cube.getConfig();
    cubeDesc = cube.getDescriptor();

    taskId = context.getTaskAttemptID().getTaskID().getId();

    reducerMapping = new FactDistinctColumnsReducerMapping(cube);

    logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
        // hll
        isStatistics = true;
        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        baseCuboidRowCountInMappers = Lists.newArrayList();
        cuboidHLLMap = Maps.newHashMap();
        samplingPercentage = Integer
                .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
        logger.info("Reducer " + taskId + " handling stats");
    } else {
        // normal col
        col = reducerMapping.getColForReducer(taskId);
        Preconditions.checkNotNull(col);

        // local build dict
        buildDictInReducer = config.isBuildDictInReducerEnabled();
        if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
            buildDictInReducer = false;
        }
        if (reducerMapping.getReducerNumForDimCol(col) > 1) {
            buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
        }
        if (buildDictInReducer) {
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        }
        logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
    }
}