Java Code Examples for org.apache.kylin.common.KylinConfig#getMrHiveDictColumnsExcludeRefColumns()

The following examples show how to use org.apache.kylin.common.KylinConfig#getMrHiveDictColumnsExcludeRefColumns().
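Before the examples, here is a minimal, hypothetical sketch (not taken from the Kylin code base; the class name MrHiveDictColumnsSketch is invented for illustration) of the guard pattern the examples below repeat: read the Hive global dictionary columns, with ref columns excluded, from KylinConfig and only proceed when the setting is non-empty.

import org.apache.kylin.common.KylinConfig;

public class MrHiveDictColumnsSketch {
    public static void main(String[] args) {
        // Read the current Kylin configuration from the environment
        KylinConfig config = KylinConfig.getInstanceFromEnv();

        // Columns configured for the Hive global dictionary, excluding ref columns
        String[] dictCols = config.getMrHiveDictColumnsExcludeRefColumns();

        // The examples treat a null array, an empty array, or a single empty string
        // as "no Hive global dictionary columns configured"
        if (dictCols != null && dictCols.length > 0 && !"".equals(dictCols[0])) {
            for (String col : dictCols) {
                System.out.println("hive global dict column: " + col);
            }
        } else {
            System.out.println("no hive global dictionary columns configured");
        }
    }
}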
Example 1
Source File: HiveInputBase.java    From kylin with Apache License 2.0
@Override
public void addStepPhase_ReplaceFlatTableGlobalColumnValue(DefaultChainedExecutable jobFlow) {
    KylinConfig dictConfig = flatDesc.getSegment().getConfig();
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    String globalDictTable = MRHiveDictUtil.globalDictTableName(flatDesc, cubeName);
    String globalDictDatabase = dictConfig.getMrHiveDictDB();

    String[] mrHiveDictColumnsExcludeRefCols = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    Map<String, String> dictRef = dictConfig.getMrHiveDictRefColumns();
    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

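    // merge step: merge new dictionary values into the global dict table and register these columns for the replace step below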
    if (Objects.nonNull(mrHiveDictColumnsExcludeRefCols) && mrHiveDictColumnsExcludeRefCols.length > 0) {
        jobFlow.addTask(createHiveGlobalDictMergeGlobalDict(flatDesc, hiveInitStatements, cubeName, mrHiveDictColumnsExcludeRefCols, globalDictDatabase, globalDictTable));
        for (String item : mrHiveDictColumnsExcludeRefCols) {
            dictRef.put(item, "");
        }
    }

    // replace step
    if (!dictRef.isEmpty()) {
        jobFlow.addTask(createMrHiveGlobalDictReplaceStep(flatDesc, hiveInitStatements, cubeName,
                dictRef, flatTableDatabase, globalDictDatabase, globalDictTable, dictConfig.getMrHiveDictTableSuffix(), jobFlow.getId()));
    }
}
 
Example 2
Source File: BatchCubingJobBuilder2.java    From kylin with Apache License 2.0
/**
 * Build hive global dictionary by MR and encode corresponding column into integer for flat table
 */
protected void buildHiveGlobalDictionaryByMR(final CubingJob result, String jobId) {
    KylinConfig dictConfig = seg.getConfig();
    String[] mrHiveDictColumnExcludeRef = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();

    if (Objects.nonNull(mrHiveDictColumnExcludeRef) && mrHiveDictColumnExcludeRef.length > 0
            && !"".equals(mrHiveDictColumnExcludeRef[0])) {

        // 1. parallel part build
        result.addTask(createBuildGlobalHiveDictPartBuildJob(jobId));

        // 2. parallel total build
        result.addTask(createBuildGlobalHiveDictTotalBuildJob(jobId));
    }

    // Merge new dictionary entry into global dictionary and replace/encode flat table
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0 && !"".equals(mrHiveDictColumns[0])) {
        inputSide.addStepPhase_ReplaceFlatTableGlobalColumnValue(result);
    }
}
 
Example 3
Source File: SparkBatchCubingJobBuilder2.java    From kylin with Apache License 2.0
/**
 * Build hive global dictionary by MR and encode corresponding column into integer for flat table
 */
protected void buildHiveGlobalDictionaryByMR(final CubingJob result, String jobId) {
    KylinConfig dictConfig = seg.getConfig();
    String[] mrHiveDictColumnExcludeRef = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumns();

    if (Objects.nonNull(mrHiveDictColumnExcludeRef) && mrHiveDictColumnExcludeRef.length > 0
            && !"".equals(mrHiveDictColumnExcludeRef[0])) {

        // 1. parallel part build
        result.addTask(createBuildGlobalHiveDictPartBuildJob(jobId));

        // 2. parallel total build
        result.addTask(createBuildGlobalHiveDictTotalBuildJob(jobId));
    }

    // Merge new dictionary entries into the global dictionary and replace/encode the flat table
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0 && !"".equals(mrHiveDictColumns[0])) {
        inputSide.addStepPhase_ReplaceFlatTableGlobalColumnValue(result);
    }
}
 
Example 4
Source File: HiveInputBase.java    From kylin with Apache License 2.0
@Override
public void addStepPhase1_CreateFlatTable(DefaultChainedExecutable jobFlow) {
    final String cubeName = CubingExecutableUtil.getCubeName(jobFlow.getParams());
    CubeInstance cubeInstance = CubeManager.getInstance(KylinConfig.getInstanceFromEnv()).getCube(cubeName);
    final KylinConfig cubeConfig = cubeInstance.getConfig();

    final String hiveInitStatements = JoinedFlatTable.generateHiveInitStatements(flatTableDatabase);

    // create flat table first
    addStepPhase1_DoCreateFlatTable(jobFlow);

    // create hive global dictionary
    KylinConfig dictConfig = flatDesc.getSegment().getConfig();
    String[] mrHiveDictColumns = dictConfig.getMrHiveDictColumnsExcludeRefColumns();
    if (Objects.nonNull(mrHiveDictColumns) && mrHiveDictColumns.length > 0
            && !"".equals(mrHiveDictColumns[0])) {
        addStepPhase1_DoCreateMrHiveGlobalDict(jobFlow, mrHiveDictColumns);
    }

    // then count and redistribute
    if (cubeConfig.isHiveRedistributeEnabled()) {
        final KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();
        //jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc, cubeInstance.getDescriptor()));
        if (kylinConfig.isLivyEnabled() && cubeInstance.getEngineType() == IEngineAware.ID_SPARK) {
            jobFlow.addTask(createRedistributeFlatHiveTableByLivyStep(hiveInitStatements, cubeName, flatDesc,
                    cubeInstance.getDescriptor()));
        } else {
            jobFlow.addTask(createRedistributeFlatHiveTableStep(hiveInitStatements, cubeName, flatDesc,
                    cubeInstance.getDescriptor()));
        }
    }

    // special for hive
    addStepPhase1_DoMaterializeLookupTable(jobFlow);
}
 
Example 5
Source File: BuildGlobalHiveDictPartBuildReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    mos = new MultipleOutputs(context);
    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
}
 
Example 6
Source File: BuildGlobalHiveDictPartBuildMapper.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    tmpbuf = ByteBuffer.allocate(64);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    String[] dicCols = config.getMrHiveDictColumnsExcludeRefColumns();
    logger.info("kylin.dictionary.mr-hive.columns: exclude ref cols {}", dicCols);

    //eg: /user/kylin/warehouse/db/kylin_intermediate_kylin_sales_cube_mr_6222c210_ce2d_e8ce_dd0f_f12c38fa9115__group_by/dict_column=KYLIN_SALES_SELLER_ID/part-000
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    //eg: dict_column=KYLIN_SALES_SELLER_ID
    String name = fileSplit.getPath().getParent().getName();
    logger.info("this map file name :{}", name);

    //eg: KYLIN_SALES_SELLER_ID
    String colName = name.split("=")[1];
    logger.info("this map build col name :{}", colName);

    for (int i = 0; i < dicCols.length; i++) {
        if (dicCols[i].equalsIgnoreCase(colName)) {
            colIndex = i;
        }
    }
    if (colIndex < 0 || colIndex > 127) {
        logger.error("kylin.dictionary.mr-hive.columns colIndex :{} error ", colIndex);
        logger.error("kylin.dictionary.mr-hive.columns set error,mr-hive columns's count should less than 128");
    }
    logger.info("this map build col index :{}", colIndex);

}
 
Example 7
Source File: BuildGlobalHiveDictTotalBuildJob.java    From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    String[] dicColsArr = null;
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_GLOBAL_DIC_MAX_DISTINCT_COUNT);
        options.addOption(OPTION_GLOBAL_DIC_PART_REDUCE_STATS);
        parseOptions(options, args);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        dicColsArr = config.getMrHiveDictColumnsExcludeRefColumns();
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        logger.info("Starting: " + job.getJobName());

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
        job.getConfiguration().set("partition.statistics.path", getOptionValue(OPTION_GLOBAL_DIC_PART_REDUCE_STATS));
        job.getConfiguration().set("last.max.dic.value.path", getOptionValue(OPTION_GLOBAL_DIC_MAX_DISTINCT_COUNT));
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);

        job.setJarByClass(BuildGlobalHiveDictTotalBuildJob.class);

        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(BuildGlobalHiveDictTotalBuildMapper.class);

        // Input Output
        setInput(job, getOptionValue(OPTION_INPUT_PATH));
        setOutput(job, dicColsArr, getOptionValue(OPTION_OUTPUT_PATH));

        job.setNumReduceTasks(0); // map-only job, no reduce phase

        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // prevent creating zero-sized default output files
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // delete output
        Path baseOutputPath = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        deletePath(job.getConfiguration(), baseOutputPath);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 8
Source File: BuildGlobalHiveDictPartBuildJob.java    From kylin with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    String[] dicColsArr = null;

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        dicColsArr = config.getMrHiveDictColumnsExcludeRefColumns();

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));

        // add metadata to distributed cache
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        logger.info("Starting: " + job.getJobName());

        job.setJarByClass(BuildGlobalHiveDictPartBuildJob.class);

        setJobClasspath(job, cube.getConfig());

        //FileInputFormat.setInputPaths(job, input);
        setInput(job, dicColsArr, getInputPath(config, segment));

        // make each reducer output to respective dir
        setOutput(job, dicColsArr, getOptionValue(OPTION_OUTPUT_PATH));
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", false);

        //set reduce num
        setReduceNum(job, config);

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(BuildGlobalHiveDictPartBuildMapper.class);
        job.setPartitionerClass(BuildGlobalHiveDictPartPartitioner.class);
        job.setReducerClass(BuildGlobalHiveDictPartBuildReducer.class);

        // prevent creating zero-sized default output files
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        // delete output
        Path baseOutputPath = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        deletePath(job.getConfiguration(), baseOutputPath);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 9
Source File: BuildGlobalHiveDictTotalBuildMapper.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config;
    try {
        config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    cols = config.getMrHiveDictColumnsExcludeRefColumns();


    String statPath = conf.get("partition.statistics.path");

    // get the input file name; the file name format is colIndex-part-partitionNum, e.g. 1-part-000019
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String[] arr = fileSplit.getPath().getName().split("-");
    int partitionNum = Integer.parseInt(arr[2]);
    colIndex = Integer.parseInt(arr[0]);
    colName = cols[colIndex];
    logger.info("Input fileName:{}, colIndex:{}, colName:{}, partitionNum:{}", fileSplit.getPath().getName(), colIndex, colName, partitionNum);

    //last max dic value per column
    String lastMaxValuePath = conf.get("last.max.dic.value.path");
    logger.info("last.max.dic.value.path:" + lastMaxValuePath);
    long lastMaxDictValue = this.getLastMaxDicValue(conf, lastMaxValuePath);
    logger.info("last.max.dic.value.path:" + lastMaxValuePath + ",value=" + lastMaxDictValue);

    // Calculate the starting offset of this file: the sum of the counts of all preceding partitions plus the column's last max dict value
    Map<Integer, TreeMap<Integer, Long>> allStats = getPartitionsCount(conf, statPath); //<colIndex,<reduceNum,count>>
    TreeMap<Integer, Long> partitionStats = allStats.get(colIndex);
    if (partitionNum != 0) {
        SortedMap<Integer, Long> subStat = partitionStats.subMap(0, true, partitionNum, false);
        subStat.forEach((k, v) -> {
            logger.info("Split num:{} and it's count:{}", k, v);
            start += v;
        });
    }
    start += lastMaxDictValue;
    logger.info("global dic.{}.split.num.{} build dict start offset is {}", colName, partitionNum, start);
}