/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.kylin.engine.mr;

import org.apache.kylin.cube.CubeSegment;
import org.apache.kylin.cube.cuboid.CuboidModeEnum;
import org.apache.kylin.cube.cuboid.CuboidUtil;
import org.apache.kylin.engine.mr.common.AbstractHadoopJob;
import org.apache.kylin.engine.mr.common.BatchConstants;
import org.apache.kylin.engine.mr.common.MapReduceExecutable;
import org.apache.kylin.engine.mr.steps.CubingExecutableUtil;
import org.apache.kylin.engine.mr.steps.InMemCuboidFromBaseCuboidJob;
import org.apache.kylin.engine.mr.steps.NDCuboidJob;
import org.apache.kylin.engine.mr.streaming.ColumnToRowJob;
import org.apache.kylin.engine.mr.streaming.MergeDictJob;
import org.apache.kylin.engine.mr.streaming.SaveDictStep;
import org.apache.kylin.job.constant.ExecutableConstants;
import org.apache.kylin.job.engine.JobEngineConfig;
import org.apache.kylin.job.execution.DefaultChainedExecutable;
import org.apache.kylin.stream.core.util.HDFSUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Created by gwang3 on 2017/2/23.
 */
public class StreamingCubingJobBuilder extends JobBuilderSupport {
    private static final Logger logger = LoggerFactory.getLogger(StreamingCubingJobBuilder.class);

    private final IMROutput2.IMRBatchCubingOutputSide2 outputSide;

    public StreamingCubingJobBuilder(CubeSegment newSegment, String submitter) {
        super(newSegment, submitter);
        this.outputSide = MRUtil.getBatchCubingOutputSide2(seg);
    }
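
    /**
     * Assembles the chained executable that builds a streaming cube segment.
     * The phases mirror the task order below: merge and persist the receiver
     * dictionaries, convert the columnar records into a base cuboid, derive
     * statistics from it, run in-memory cubing, and finally update metadata
     * and clean up.
     */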
    public CubingJob build() {
        logger.info("MR_V2 new job to BUILD streaming segment " + seg);

        final CubingJob result = CubingJob.createStreamJob(seg, submitter, config);
        final String jobId = result.getId();
        final String streamingStoragePath = getStreamingIndexPath(jobId);
        final String cuboidRootPath = getCuboidRootPath(jobId);

        // Phase 1: merge the dictionaries produced by each streaming receiver
        result.addTask(createMergeDictStep(streamingStoragePath, jobId, result));
        result.addTask(createSaveDictStep(jobId, result));

        String tmpBaseCuboidPath = getBaseCuboidPathForStreaming(jobId);
        // Phase 2: convert the columnar records produced by each streaming receiver
        // into row records, output as the base cuboid
        addBuildBaseCuboidStep(result, tmpBaseCuboidPath, streamingStoragePath);

        // Phase 3: calculate statistics from the base cuboid
        // TODO: currently reuses the optimization file path; a dedicated path needs to be defined
        result.addTask(createCalculateStatsFromBaseCuboid(tmpBaseCuboidPath, getStatisticsPath(jobId)));
        result.addTask(createSaveStatisticsStep(jobId));
        // create HTable
        outputSide.addStepPhase2_BuildDictionary(result);

        // Phase 4: build the cube
        // addLayerCubingStepsOnBaseCuboid(result, jobId, cuboidRootPath); // layer cubing is not used for streaming cubes
        // in-mem cubing; only the selected algorithm will execute
        result.addTask(createInMemCubingStep(jobId, CuboidModeEnum.CURRENT, cuboidRootPath, tmpBaseCuboidPath));
        outputSide.addStepPhase3_BuildCube(result);

        // Phase 5: update metadata & cleanup
        // result.addTask(createUpdateStreamCubeInfoAfterBuildStep(jobId));
        outputSide.addStepPhase4_Cleanup(result);

        return result;
    }

    private void addBuildBaseCuboidStep(final CubingJob result, final String outputBaseCuboidPath,
            final String streamingStoragePath) {
        result.addTask(createBaseCuboidStep(streamingStoragePath, outputBaseCuboidPath));
    }

    private void addLayerCubingStepsOnBaseCuboid(final CubingJob result, final String jobId,
            final String cuboidRootPath) {
        // Statistics are unknown at this point, so a tree cuboid scheduler cannot be
        // chosen; determine the max level at runtime instead
        final int maxLevel = CuboidUtil.getLongestDepth(seg.getCuboidScheduler().getAllCuboidIds());
        // base cuboid step
        result.addTask(createBaseCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, 0), jobId));
        // n-dimension cuboid steps
        for (int i = 1; i <= maxLevel; i++) {
            result.addTask(createNDimensionCuboidStep(getCuboidOutputPathsByLevel(cuboidRootPath, i - 1),
                    getCuboidOutputPathsByLevel(cuboidRootPath, i), i, jobId));
        }
    }

    private MapReduceExecutable createMergeDictStep(String streamingStoragePath, String jobId,
            DefaultChainedExecutable jobFlow) {
        MapReduceExecutable mergeDict = new MapReduceExecutable();
        mergeDict.setName(ExecutableConstants.STEP_NAME_STREAMING_CREATE_DICTIONARY);
        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd, JobEngineConfig.CUBE_MERGE_JOB_CONF_SUFFIX);
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME,
                ExecutableConstants.STEP_NAME_STREAMING_CREATE_DICTIONARY);
        appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, streamingStoragePath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_NAME, seg.getName());
        // The trySaveNewDict API is used instead of the MR job output, so the output path is unused here
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, getDictPath(jobId));

        final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
        mergeDict.setMapReduceParams(cmd.toString());
        mergeDict.setMapReduceJobClass(MergeDictJob.class);
        mergeDict.setLockPathName(cubeName);
        mergeDict.setIsNeedLock(true);
        mergeDict.setIsNeedReleaseLock(false);
        mergeDict.setJobFlowJobId(jobFlow.getId());
        return mergeDict;
    }
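
    /**
     * Builds the in-memory cubing step that computes all cuboids from the base
     * cuboid produced in Phase 2. The counter mapping ",," + CUBE_SIZE_BYTES
     * saves only the resulting cube size; source record counts are captured by
     * the base cuboid step instead.
     */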
    private MapReduceExecutable createInMemCubingStep(String jobId, CuboidModeEnum cuboidMode, String cuboidRootPath,
            String tmpBaseCuboidPath) {
        MapReduceExecutable cubeStep = new MapReduceExecutable();

        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd, JobEngineConfig.IN_MEM_JOB_CONF_SUFFIX);

        cubeStep.setName(ExecutableConstants.STEP_NAME_BUILD_IN_MEM_CUBE);

        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, tmpBaseCuboidPath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, getInMemCuboidPath(cuboidRootPath));
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME,
                "Kylin_Cube_Builder_" + seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBOID_MODE, cuboidMode.toString());
        appendExecCmdParameters(cmd, BatchConstants.ARG_UPDATE_SHARD, "true");

        cubeStep.setMapReduceParams(cmd.toString());
        cubeStep.setMapReduceJobClass(InMemCuboidFromBaseCuboidJob.class);
        cubeStep.setCounterSaveAs(",," + CubingJob.CUBE_SIZE_BYTES);
        return cubeStep;
    }

    private MapReduceExecutable createNDimensionCuboidStep(String parentPath, String outputPath, int level,
            String jobId) {
        // ND cuboid job
        MapReduceExecutable ndCuboidStep = new MapReduceExecutable();

        ndCuboidStep.setName(ExecutableConstants.STEP_NAME_BUILD_N_D_CUBOID + " : level " + level);

        StringBuilder cmd = new StringBuilder();

        appendMapReduceParameters(cmd);
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_ID, seg.getUuid());
        appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, parentPath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, outputPath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME,
                "Kylin_ND-Cuboid_Builder_" + seg.getRealization().getName() + "_Step");
        appendExecCmdParameters(cmd, BatchConstants.ARG_LEVEL, "" + level);
        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBING_JOB_ID, jobId);

        ndCuboidStep.setMapReduceParams(cmd.toString());
        ndCuboidStep.setMapReduceJobClass(getNDCuboidJob());
        return ndCuboidStep;
    }

    protected Class<? extends AbstractHadoopJob> getNDCuboidJob() {
        return NDCuboidJob.class;
    }

    private MapReduceExecutable createBaseCuboidStep(String streamingStoragePath, String basicCuboidOutputPath) {
        // base cuboid job
        MapReduceExecutable baseCuboidStep = new MapReduceExecutable();

        StringBuilder cmd = new StringBuilder();
        appendMapReduceParameters(cmd, JobEngineConfig.IN_MEM_JOB_CONF_SUFFIX);

        baseCuboidStep.setName(ExecutableConstants.STEP_NAME_STREAMING_BUILD_BASE_CUBOID);

        appendExecCmdParameters(cmd, BatchConstants.ARG_CUBE_NAME, seg.getRealization().getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_SEGMENT_NAME, seg.getName());
        appendExecCmdParameters(cmd, BatchConstants.ARG_INPUT, streamingStoragePath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_OUTPUT, basicCuboidOutputPath);
        appendExecCmdParameters(cmd, BatchConstants.ARG_JOB_NAME,
                "Kylin_Base_Cuboid_Builder_" + seg.getRealization().getName());

        baseCuboidStep.setMapReduceParams(cmd.toString());
        baseCuboidStep.setMapReduceJobClass(ColumnToRowJob.class);
        // TODO: need some way to get the real source record count from the fragment metadata
        baseCuboidStep.setCounterSaveAs(CubingJob.SOURCE_RECORD_COUNT + "," + CubingJob.SOURCE_SIZE_BYTES);
        return baseCuboidStep;
    }
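
    /**
     * Persists the dictionaries merged by the merge-dict step. Note the lock
     * handoff: createMergeDictStep acquires the cube lock and does not release
     * it (setIsNeedReleaseLock(false)); this step releases it once the
     * dictionaries are saved (setIsNeedReleaseLock(true)).
     */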
    private SaveDictStep createSaveDictStep(String jobId, DefaultChainedExecutable jobFlow) {
        SaveDictStep result = new SaveDictStep();
        final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
        result.setName(ExecutableConstants.STEP_NAME_STREAMING_SAVE_DICTS);

        CubingExecutableUtil.setCubeName(seg.getRealization().getName(), result.getParams());
        CubingExecutableUtil.setSegmentId(seg.getUuid(), result.getParams());
        CubingExecutableUtil.setDictsPath(getDictPath(jobId), result.getParams());
        CubingExecutableUtil.setCubingJobId(jobId, result.getParams());

        result.setIsNeedReleaseLock(true);
        result.setJobFlowJobId(jobFlow.getId());
        result.setLockPathName(cubeName);
        return result;
    }

    public String getStreamingIndexPath(String jobId) {
        return HDFSUtil.getStreamingSegmentFilePath(seg.getRealization().getName(), seg.getName());
    }

    private String getBaseCuboidPathForStreaming(String jobId) {
        return getRealizationRootPath(jobId) + "/stream_temp/" + PathNameCuboidBase;
    }

    private String getDictPath(String jobId) {
        return getRealizationRootPath(jobId) + "/dict";
    }
}
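
/*
 * Illustrative usage sketch (not part of this class): the builder is normally
 * driven by Kylin's job service, but assuming a CubeSegment "seg" and a
 * submitter name are at hand, the produced chain can be enqueued through
 * ExecutableManager:
 *
 *   CubingJob job = new StreamingCubingJobBuilder(seg, submitter).build();
 *   ExecutableManager.getInstance(seg.getConfig()).addJob(job);
 */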