Java Code Examples for org.apache.kylin.cube.CubeInstance#getMergingSegments()

The following examples show how to use org.apache.kylin.cube.CubeInstance#getMergingSegments(). Each example is taken from the project and source file named in its header.
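A minimal usage sketch first, assuming a reachable Kylin metadata store and an existing cube; the method name listMergingSegments and its cubeName/mergedSegmentId parameters are illustrative placeholders, not part of the Kylin API:

// Resolve the cube, look up the new merged segment by id, then ask the cube
// which existing source segments that merge will cover.
Segments<CubeSegment> listMergingSegments(String cubeName, String mergedSegmentId) {
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    CubeManager cubeManager = CubeManager.getInstance(config);
    CubeInstance cube = cubeManager.getCube(cubeName);
    CubeSegment mergedSeg = cube.getSegmentById(mergedSegmentId);
    // getMergingSegments(mergedSeg) returns the source segments covered by the
    // merged segment's range, as used throughout the examples below.
    return cube.getMergingSegments(mergedSeg);
}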
Example 1
Source File: CubeMergeJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doExecute() throws Exception {
    buildLayoutWithUpdate = new BuildLayoutWithUpdate();
    String cubeId = getParam(MetadataConstants.P_CUBE_ID);
    String newSegmentId = getParam(MetadataConstants.P_SEGMENT_IDS);
    final CubeManager cubeManager = CubeManager.getInstance(config);
    final CubeInstance cube = cubeManager.getCubeByUuid(cubeId);
    final CubeSegment mergedSeg = cube.getSegmentById(newSegmentId);
    mergingSegments = cube.getMergingSegments(mergedSeg);
    for (CubeSegment segment : mergingSegments) {
        SegmentInfo segInfo = ManagerHub.getSegmentInfo(config, getParam(MetadataConstants.P_CUBE_ID), segment.getUuid());
        mergingSegInfos.add(segInfo);
    }
    // merge and save segments
    mergeSegments(cubeId, newSegmentId);
}
 
Example 2
Source File: MergeOffsetStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager cubeManager = CubeManager.getInstance(context.getConfig());
    final CubeInstance cubeCopy = cubeManager.getCube(CubingExecutableUtil.getCubeName(this.getParams())).latestCopyForWrite();
    final String segmentId = CubingExecutableUtil.getSegmentId(this.getParams());
    final CubeSegment segCopy = cubeCopy.getSegmentById(segmentId);

    Preconditions.checkNotNull(segCopy, "Cube segment '" + segmentId + "' not found.");
    Segments<CubeSegment> mergingSegs = cubeCopy.getMergingSegments(segCopy);

    Preconditions.checkArgument(mergingSegs.size() > 0, "Merging segment not exist.");

    Collections.sort(mergingSegs);
    final CubeSegment first = mergingSegs.get(0);
    final CubeSegment last = mergingSegs.get(mergingSegs.size() - 1);

    segCopy.setSegRange(new SegmentRange(first.getSegRange().start, last.getSegRange().end));
    segCopy.setSourcePartitionOffsetStart(first.getSourcePartitionOffsetStart());
    segCopy.setSourcePartitionOffsetEnd(last.getSourcePartitionOffsetEnd());

    segCopy.setTSRange(new TSRange(mergingSegs.getTSStart(), mergingSegs.getTSEnd()));

    CubeUpdate update = new CubeUpdate(cubeCopy);
    update.setToUpdateSegs(segCopy);
    try {
        cubeManager.updateCube(update);
        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to update cube segment offset", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 3
Source File: JobStepFactory.java    From kylin-on-parquet-v2 with Apache License 2.0
public static NSparkExecutable addStep(DefaultChainedExecutable parent, JobStepType type,
        CubeInstance cube) {
    NSparkExecutable step;
    KylinConfig config = cube.getConfig();
    switch (type) {
    case RESOURCE_DETECT:
        step = new NResourceDetectStep(parent);
        break;
    case CUBING:
        step = new NSparkCubingStep(config.getSparkBuildClassName());
        break;
    case MERGING:
        step = new NSparkMergingStep(config.getSparkMergeClassName());
        break;
    case CLEAN_UP_AFTER_MERGE:
        step = new NSparkUpdateMetaAndCleanupAfterMergeStep();
        break;
    default:
        throw new IllegalArgumentException();
    }

    step.setParams(parent.getParams());
    step.setProject(parent.getProject());
    step.setTargetSubject(parent.getTargetSubject());
    if (step instanceof NSparkUpdateMetaAndCleanupAfterMergeStep) {
        CubeSegment mergeSegment = cube.getSegmentById(parent.getTargetSegments().iterator().next());
        final Segments<CubeSegment> mergingSegments = cube.getMergingSegments(mergeSegment);
        step.setParam(MetadataConstants.P_SEGMENT_NAMES,
                String.join(",", NSparkCubingUtil.toSegmentNames(mergingSegments)));
        step.setParam(CubingExecutableUtil.SEGMENT_ID, parent.getParam(CubingExecutableUtil.SEGMENT_ID));
        step.setParam(MetadataConstants.P_JOB_TYPE, parent.getParam(MetadataConstants.P_JOB_TYPE));
        step.setParam(MetadataConstants.P_OUTPUT_META_URL, parent.getParam(MetadataConstants.P_OUTPUT_META_URL));
    }
    parent.addTask(step);
    //after addTask, step's id is changed
    step.setDistMetaUrl(config.getJobTmpMetaStoreUrl(parent.getProject(), step.getId()));
    return step;
}
 
Example 4
Source File: ResourceDetectBeforeMergingJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doExecute() throws Exception {
    logger.info("Start detect resource before merge.");
    String cubeId = getParam(MetadataConstants.P_CUBE_ID);

    final CubeManager cubeManager = CubeManager.getInstance(config);
    final CubeInstance cube = cubeManager.getCubeByUuid(cubeId);
    final CubeSegment mergedSeg = cube.getSegmentById(getParam(MetadataConstants.P_SEGMENT_IDS));
    final SegmentInfo mergedSegInfo = MetadataConverter.getSegmentInfo(cube, mergedSeg.getUuid(),
            mergedSeg.getName(), mergedSeg.getStorageLocationIdentifier());
    final List<CubeSegment> mergingSegments = cube.getMergingSegments(mergedSeg);
    final List<SegmentInfo> segmentInfos = Lists.newArrayList();
    Collections.sort(mergingSegments);
    for (CubeSegment cubeSegment : mergingSegments) {
        segmentInfos.add(MetadataConverter.getSegmentInfo(cube, cubeSegment.getUuid(), cubeSegment.getName(),
                cubeSegment.getStorageLocationIdentifier()));
    }
    infos.clearMergingSegments();
    infos.recordMergingSegments(segmentInfos);
    Map<Long, DFLayoutMergeAssist> mergeCuboidsAssist = CubeMergeJob.generateMergeAssist(segmentInfos, ss);
    ResourceDetectUtils.write(
            new Path(config.getJobTmpShareDir(project, jobId), ResourceDetectUtils.countDistinctSuffix()),
            ResourceDetectUtils
                    .findCountDistinctMeasure(JavaConversions.asJavaCollection(mergedSegInfo.toBuildLayouts())));
    Map<String, List<String>> resourcePaths = Maps.newHashMap();
    infos.clearSparkPlans();
    for (Map.Entry<Long, DFLayoutMergeAssist> entry : mergeCuboidsAssist.entrySet()) {
        Dataset<Row> afterMerge = entry.getValue().merge(config, getParam(MetadataConstants.P_CUBE_NAME));
        infos.recordSparkPlan(afterMerge.queryExecution().sparkPlan());
        List<Path> paths = JavaConversions
                .seqAsJavaList(ResourceDetectUtils.getPaths(afterMerge.queryExecution().sparkPlan()));
        List<String> pathStrs = paths.stream().map(Path::toString).collect(Collectors.toList());
        resourcePaths.put(String.valueOf(entry.getKey()), pathStrs);
    }
    ResourceDetectUtils.write(new Path(config.getJobTmpShareDir(project, jobId),
            mergedSeg.getUuid() + "_" + ResourceDetectUtils.fileName()), resourcePaths);
}
 
Example 5
Source File: MergeOffsetStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager cubeManager = CubeManager.getInstance(context.getConfig());
    final CubeInstance cubeCopy = cubeManager.getCube(CubingExecutableUtil.getCubeName(this.getParams())).latestCopyForWrite();
    final String segmentId = CubingExecutableUtil.getSegmentId(this.getParams());
    final CubeSegment segCopy = cubeCopy.getSegmentById(segmentId);

    Preconditions.checkNotNull(segCopy, "Cube segment '" + segmentId + "' not found.");
    Segments<CubeSegment> mergingSegs = cubeCopy.getMergingSegments(segCopy);

    Preconditions.checkArgument(mergingSegs.size() > 0, "Merging segment not exist.");

    Collections.sort(mergingSegs);
    final CubeSegment first = mergingSegs.get(0);
    final CubeSegment last = mergingSegs.get(mergingSegs.size() - 1);

    segCopy.setSegRange(new SegmentRange(first.getSegRange().start, last.getSegRange().end));
    segCopy.setSourcePartitionOffsetStart(first.getSourcePartitionOffsetStart());
    segCopy.setSourcePartitionOffsetEnd(last.getSourcePartitionOffsetEnd());

    segCopy.setTSRange(new TSRange(mergingSegs.getTSStart(), mergingSegs.getTSEnd()));

    CubeUpdate update = new CubeUpdate(cubeCopy);
    update.setToUpdateSegs(segCopy);
    try {
        cubeManager.updateCube(update);
        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to update cube segment offset", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 6
Source File: MergeCuboidJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String input = getOptionValue(OPTION_INPUT_PATH);
        String output = getOptionValue(OPTION_OUTPUT_PATH);
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment cubeSeg = cube.getSegmentById(segmentID);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        logger.info("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job, cube.getConfig());

        // add metadata to distributed cache
        Segments<CubeSegment> allSegs = cube.getMergingSegments(cubeSeg);
        allSegs.add(cubeSeg);
        attachSegmentsMetadataWithDict(allSegs, job.getConfiguration());

        // Mapper
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer
        job.setReducerClass(CuboidReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set inputs
        IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(cubeSeg).getOutputFormat();
        outputFormat.configureJobInput(job, input);
        addInputDirs(input, job);

        // set output
        outputFormat.configureJobOutput(job, output, cubeSeg);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 7
Source File: MergeDictionaryJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks", String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);

    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 8
Source File: SparkExecutableLivy.java    From kylin-on-parquet-v2 with Apache License 2.0
@SuppressWarnings("checkstyle:methodlength")
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    ExecutableManager mgr = getManager();
    Map<String, String> extra = mgr.getOutput(getId()).getExtra();
    String sparkJobId = extra.get(ExecutableConstants.SPARK_JOB_ID);
    if (!StringUtils.isEmpty(sparkJobId)) {
        return onResumed(sparkJobId, mgr);
    } else {
        String cubeName = this.getParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt());
        CubeInstance cube = CubeManager.getInstance(context.getConfig()).getCube(cubeName);
        final KylinConfig config = cube.getConfig();

        setAlgorithmLayer();

        LivyRestBuilder livyRestBuilder = new LivyRestBuilder();

        String segmentID = this.getParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt());
        CubeSegment segment = cube.getSegmentById(segmentID);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);
        dumpMetadata(segment, mergingSeg);

        Map<String, String> sparkConfs = config.getSparkConfigOverride();
        String sparkConfigName = getSparkConfigName();
        if (sparkConfigName != null) {
            Map<String, String> sparkSpecificConfs = config.getSparkConfigOverrideWithSpecificName(sparkConfigName);
            sparkConfs.putAll(sparkSpecificConfs);
        }

        for (Map.Entry<String, String> entry : sparkConfs.entrySet()) {
            if (entry.getKey().equals("spark.submit.deployMode") || entry.getKey().equals("spark.master")
                    || entry.getKey().equals("spark.yarn.archive")) {
                continue;
            } else {
                livyRestBuilder.addConf(entry.getKey(), entry.getValue());
            }
        }
        formatArgs(livyRestBuilder.getArgs());

        final LivyRestExecutor executor = new LivyRestExecutor();
        final PatternedLogger patternedLogger = new PatternedLogger(logger, (infoKey, info) -> {
            // only care three properties here
            if (ExecutableConstants.SPARK_JOB_ID.equals(infoKey) || ExecutableConstants.YARN_APP_ID.equals(infoKey)
                    || ExecutableConstants.YARN_APP_URL.equals(infoKey)) {
                getManager().addJobInfo(getId(), info);
            }
        });

        try {
            livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.job);
            executor.execute(livyRestBuilder, patternedLogger);
            if (isDiscarded()) {
                return new ExecuteResult(ExecuteResult.State.DISCARDED, "Discarded");
            }
            if (isPaused()) {
                return new ExecuteResult(ExecuteResult.State.STOPPED, "Stopped");
            }
            // done, update all properties
            Map<String, String> joblogInfo = patternedLogger.getInfo();
            // read counter from hdfs
            String counterOutput = getParam(BatchConstants.ARG_COUNTER_OUTPUT);
            if (counterOutput != null) {
                if (HadoopUtil.getWorkingFileSystem().exists(new Path(counterOutput))) {
                    Map<String, String> counterMap = HadoopUtil.readFromSequenceFile(counterOutput);
                    joblogInfo.putAll(counterMap);
                } else {
                    logger.warn("Spark counter output path not exists: " + counterOutput);
                }
            }
            readCounters(joblogInfo);
            getManager().addJobInfo(getId(), joblogInfo);
            return new ExecuteResult(ExecuteResult.State.SUCCEED, patternedLogger.getBufferedLog());

        } catch (Exception e) {
            logger.error("error run spark job:", e);
            // clear SPARK_JOB_ID on job failure.
            extra = mgr.getOutput(getId()).getExtra();
            extra.put(ExecutableConstants.SPARK_JOB_ID, "");
            getManager().addJobInfo(getId(), extra);
            return new ExecuteResult(ExecuteResult.State.ERROR, e.getMessage());
        }
    }
}
 
Example 9
Source File: MergeCuboidJob.java    From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String input = getOptionValue(OPTION_INPUT_PATH);
        String output = getOptionValue(OPTION_OUTPUT_PATH);
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment cubeSeg = cube.getSegmentById(segmentID);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        logger.info("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        setJobClasspath(job, cube.getConfig());

        // add metadata to distributed cache
        Segments<CubeSegment> allSegs = cube.getMergingSegments(cubeSeg);
        allSegs.add(cubeSeg);
        attachSegmentsMetadataWithDict(allSegs, job.getConfiguration());

        // Mapper
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer
        job.setReducerClass(CuboidReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        //set dfs.replication
        job.getConfiguration().set("dfs.replication", KylinConfig.getInstanceFromEnv().getCuboidDfsReplication());

        // set inputs
        IMROutput2.IMRMergeOutputFormat outputFormat = MRUtil.getBatchMergeOutputSide2(cubeSeg).getOutputFormat();
        outputFormat.configureJobInput(job, input);
        addInputDirs(input, job);

        // set output
        outputFormat.configureJobOutput(job, output, cubeSeg);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 10
Source File: MergeDictionaryJob.java    From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    try {
        Options options = new Options();
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_META_URL);
        options.addOption(OPTION_MERGE_SEGMENT_IDS);
        options.addOption(OPTION_OUTPUT_PATH_DICT);
        options.addOption(OPTION_OUTPUT_PATH_STAT);
        parseOptions(options, args);

        final String segmentId = getOptionValue(OPTION_SEGMENT_ID);
        final String segmentIds = getOptionValue(OPTION_MERGE_SEGMENT_IDS);
        final String cubeName = getOptionValue(OPTION_CUBE_NAME);
        final String metaUrl = getOptionValue(OPTION_META_URL);
        final String dictOutputPath = getOptionValue(OPTION_OUTPUT_PATH_DICT);
        final String statOutputPath = getOptionValue(OPTION_OUTPUT_PATH_STAT);

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeDesc cubeDesc = cube.getDescriptor();
        CubeSegment segment = cube.getSegmentById(segmentId);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        job.getConfiguration().set(BatchConstants.ARG_CUBE_NAME, cubeName);
        job.getConfiguration().set(OPTION_META_URL.getOpt(), metaUrl);
        job.getConfiguration().set(OPTION_SEGMENT_ID.getOpt(), segmentId);
        job.getConfiguration().set(OPTION_MERGE_SEGMENT_IDS.getOpt(), segmentIds);
        job.getConfiguration().set(OPTION_OUTPUT_PATH_STAT.getOpt(), statOutputPath);
        job.getConfiguration().set("num.map.tasks", String.valueOf(cubeDesc.getAllColumnsNeedDictionaryBuilt().size() + 1));
        job.setNumReduceTasks(1);

        setJobClasspath(job, cube.getConfig());

        // dump metadata to HDFS
        attachSegmentsMetadataWithDict(mergingSeg, metaUrl);

        // clean output dir
        HadoopUtil.deletePath(job.getConfiguration(), new Path(dictOutputPath));

        job.setMapperClass(MergeDictionaryMapper.class);
        job.setReducerClass(MergeDictionaryReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(IndexArrInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.NONE);
        SequenceFileOutputFormat.setOutputPath(job, new Path(dictOutputPath));

        logger.info("Starting: " + job.getJobName());

        return waitForCompletion(job);

    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 11
Source File: SparkExecutableLivy.java    From kylin with Apache License 2.0
@SuppressWarnings("checkstyle:methodlength")
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    ExecutableManager mgr = getManager();
    Map<String, String> extra = mgr.getOutput(getId()).getExtra();
    String sparkJobId = extra.get(ExecutableConstants.SPARK_JOB_ID);
    if (!StringUtils.isEmpty(sparkJobId)) {
        return onResumed(sparkJobId, mgr);
    } else {
        String cubeName = this.getParam(SparkCubingByLayer.OPTION_CUBE_NAME.getOpt());
        CubeInstance cube = CubeManager.getInstance(context.getConfig()).getCube(cubeName);
        final KylinConfig config = cube.getConfig();

        setAlgorithmLayer();

        LivyRestBuilder livyRestBuilder = new LivyRestBuilder();

        String segmentID = this.getParam(SparkCubingByLayer.OPTION_SEGMENT_ID.getOpt());
        CubeSegment segment = cube.getSegmentById(segmentID);
        Segments<CubeSegment> mergingSeg = cube.getMergingSegments(segment);
        dumpMetadata(segment, mergingSeg);

        Map<String, String> sparkConfs = config.getSparkConfigOverride();
        String sparkConfigName = getSparkConfigName();
        if (sparkConfigName != null) {
            Map<String, String> sparkSpecificConfs = config.getSparkConfigOverrideWithSpecificName(sparkConfigName);
            sparkConfs.putAll(sparkSpecificConfs);
        }

        for (Map.Entry<String, String> entry : sparkConfs.entrySet()) {
            if (entry.getKey().equals("spark.submit.deployMode") || entry.getKey().equals("spark.master")
                    || entry.getKey().equals("spark.yarn.archive")) {
                continue;
            } else {
                livyRestBuilder.addConf(entry.getKey(), entry.getValue());
            }
        }
        formatArgs(livyRestBuilder.getArgs());

        final LivyRestExecutor executor = new LivyRestExecutor();
        final PatternedLogger patternedLogger = new PatternedLogger(logger, (infoKey, info) -> {
            // only care three properties here
            if (ExecutableConstants.SPARK_JOB_ID.equals(infoKey) || ExecutableConstants.YARN_APP_ID.equals(infoKey)
                    || ExecutableConstants.YARN_APP_URL.equals(infoKey)) {
                getManager().addJobInfo(getId(), info);
            }
        });

        try {
            livyRestBuilder.setLivyTypeEnum(LivyTypeEnum.job);
            executor.execute(livyRestBuilder, patternedLogger);
            if (isDiscarded()) {
                return new ExecuteResult(ExecuteResult.State.DISCARDED, "Discarded");
            }
            if (isPaused()) {
                return new ExecuteResult(ExecuteResult.State.STOPPED, "Stopped");
            }
            // done, update all properties
            Map<String, String> joblogInfo = patternedLogger.getInfo();
            // read counter from hdfs
            String counterOutput = getParam(BatchConstants.ARG_COUNTER_OUTPUT);
            if (counterOutput != null) {
                if (HadoopUtil.getWorkingFileSystem().exists(new Path(counterOutput))) {
                    Map<String, String> counterMap = HadoopUtil.readFromSequenceFile(counterOutput);
                    joblogInfo.putAll(counterMap);
                } else {
                    logger.warn("Spark counter output path not exists: " + counterOutput);
                }
            }
            readCounters(joblogInfo);
            getManager().addJobInfo(getId(), joblogInfo);
            return new ExecuteResult(ExecuteResult.State.SUCCEED, patternedLogger.getBufferedLog());

        } catch (Exception e) {
            logger.error("error run spark job:", e);
            // clear SPARK_JOB_ID on job failure.
            extra = mgr.getOutput(getId()).getExtra();
            extra.put(ExecutableConstants.SPARK_JOB_ID, "");
            getManager().addJobInfo(getId(), extra);
            return new ExecuteResult(ExecuteResult.State.ERROR, e.getMessage());
        }
    }
}