Java Code Examples for org.apache.kylin.metadata.model.TblColRef#getIdentity()

The following examples show how to use org.apache.kylin.metadata.model.TblColRef#getIdentity(). Each example is taken from an open-source project; the original project and source file are noted above each listing.
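getIdentity() returns the fully qualified column name, typically in the form TABLE_ALIAS.COLUMN_NAME, which is why the examples below use it both as a map key for dictionaries and as an HDFS path segment. The following minimal sketch is not part of the Kylin sources: the GetIdentityDemo class, the sample column name, and the ".rldict" suffix are illustrative only, and it assumes a DataModelDesc obtained from a cube descriptor as in the examples below.

import org.apache.kylin.metadata.model.DataModelDesc;
import org.apache.kylin.metadata.model.TblColRef;

public class GetIdentityDemo {

    // Resolves a column name against the model and builds a dictionary file path,
    // mirroring the col.getIdentity() + "/" + col.getName() pattern seen in the examples.
    public static String dictFileNameFor(DataModelDesc model, String columnName) {
        TblColRef col = model.findColumn(columnName);       // e.g. "SELLER_ID" (hypothetical column)
        String identity = col.getIdentity();                 // "TABLE_ALIAS.COLUMN_NAME"
        return identity + "/" + col.getName() + ".rldict";   // e.g. "KYLIN_SALES.SELLER_ID/SELLER_ID.rldict"
    }
}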
Example 1
Source File: SparkFactDistinct.java    From kylin-on-parquet-v2 with Apache License 2.0
private void outputDict(TblColRef col, Dictionary<String> dict,
        List<Tuple2<String, Tuple3<Writable, Writable, String>>> result)
        throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        result.add(new Tuple2<String, Tuple3<Writable, Writable, String>>(BatchConstants.CFG_OUTPUT_DICT,
                new Tuple3<Writable, Writable, String>(NullWritable.get(),
                        new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName)));
    }
}
 
Example 2
Source File: SparkFactDistinct.java    From kylin with Apache License 2.0
private void outputDict(TblColRef col, Dictionary<String> dict,
        List<Tuple2<String, Tuple3<Writable, Writable, String>>> result)
        throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        result.add(new Tuple2<String, Tuple3<Writable, Writable, String>>(BatchConstants.CFG_OUTPUT_DICT,
                new Tuple3<Writable, Writable, String>(NullWritable.get(),
                        new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName)));
    }
}
 
Example 3
Source File: AggregationGroup.java    From kylin with Apache License 2.0
private void normalizeColumnNames(String[] names) {
    if (names == null)
        return;

    for (int i = 0; i < names.length; i++) {
        TblColRef col = cubeDesc.getModel().findColumn(names[i]);
        names[i] = col.getIdentity();
    }

    // check no dup
    Set<String> set = new HashSet<>(Arrays.asList(names));
    if (set.size() < names.length)
        throw new IllegalStateException(
                "Columns in aggrgroup must not contain duplication: " + Arrays.asList(names));
}
 
Example 4
Source File: FactDistinctColumnsReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
}
 
Example 5
Source File: DictionaryGetterUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static Map<TblColRef, Dictionary<String>> getDictionaryMap(CubeSegment cubeSegment, InputSplit inputSplit,
                                                                  Configuration configuration) throws IOException {
    Map<TblColRef, Dictionary<String>> dictionaryMap = cubeSegment.buildDictionaryMap();

    String shrunkenDictPath = configuration.get(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
    if (shrunkenDictPath == null) {
        return dictionaryMap;
    }

    // replace global dictionary with shrunken dictionary if possible
    String inputSplitSignature = getInputSplitSignature(cubeSegment, inputSplit);
    FileSystem fs = FileSystem.get(configuration);
    ShrunkenDictionary.StringValueSerializer valueSerializer = new ShrunkenDictionary.StringValueSerializer();
    for (TblColRef colRef : cubeSegment.getCubeDesc().getAllGlobalDictColumns()) {
        Path colShrunkenDictDir = new Path(shrunkenDictPath, colRef.getIdentity());
        Path colShrunkenDictPath = new Path(colShrunkenDictDir, inputSplitSignature);
        if (!fs.exists(colShrunkenDictPath)) {
            logger.warn("Shrunken dictionary for column " + colRef.getIdentity() + " in split "
                    + inputSplitSignature + " does not exist!!!");
            continue;
        }
        try (DataInputStream dis = fs.open(colShrunkenDictPath)) {
            Dictionary<String> shrunkenDict = new ShrunkenDictionary(valueSerializer);
            shrunkenDict.readFields(dis);

            dictionaryMap.put(colRef, shrunkenDict);
        }
    }

    return dictionaryMap;
}
 
Example 6
Source File: SparkBuildDictionary.java    From kylin with Apache License 2.0
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    if (initialized == false) {
        synchronized (SparkBuildDictionary.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);
    Dictionary<String> preBuiltDict;
    DictionaryInfo dictInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        preBuiltDict = getDictionary(tblColRef);

        if (preBuiltDict != null) {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            dictInfo = dictManager.saveDictionary(tblColRef, inpTable, preBuiltDict);
        } else {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            dictInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            preBuiltDict = dictInfo.getDictionaryObject();
        }
    }

    return new Tuple2<>(tblColRef.getIdentity(),
            new Tuple3<>(dictInfo.getResourcePath(), preBuiltDict.getSize(), preBuiltDict.getSizeOfId()));
}
 
Example 7
Source File: AggregationGroup.java    From kylin-on-parquet-v2 with Apache License 2.0
private void normalizeColumnNames(String[] names) {
    if (names == null)
        return;

    for (int i = 0; i < names.length; i++) {
        TblColRef col = cubeDesc.getModel().findColumn(names[i]);
        names[i] = col.getIdentity();
    }

    // check no dup
    Set<String> set = new HashSet<>(Arrays.asList(names));
    if (set.size() < names.length)
        throw new IllegalStateException(
                "Columns in aggrgroup must not contain duplication: " + Arrays.asList(names));
}
 
Example 8
Source File: CubeSegment.java    From kylin with Apache License 2.0
public String getDictResPath(TblColRef col) {
    String r;
    String dictKey = col.getIdentity();
    r = getDictionaries().get(dictKey);

    // try Kylin v1.x dict key as well
    if (r == null) {
        String v1DictKey = col.getTable() + "/" + col.getName();
        r = getDictionaries().get(v1DictKey);
    }

    return r;
}
 
Example 9
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    if (initialized == false) {
        synchronized (SparkFactDistinct.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());
        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }
        Iterator<String> values = columnValues._2.iterator();
        while (values.hasNext()) {
            builder.addValue(values.next());
        }
        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);
        Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3);
    }
}
 
Example 10
Source File: SparkBuildDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    if (initialized == false) {
        synchronized (SparkBuildDictionary.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);
    Dictionary<String> preBuiltDict;
    DictionaryInfo dictInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        preBuiltDict = getDictionary(tblColRef);

        if (preBuiltDict != null) {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            dictInfo = dictManager.saveDictionary(tblColRef, inpTable, preBuiltDict);
        } else {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            dictInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            preBuiltDict = dictInfo.getDictionaryObject();
        }
    }

    return new Tuple2<>(tblColRef.getIdentity(),
            new Tuple3<>(dictInfo.getResourcePath(), preBuiltDict.getSize(), preBuiltDict.getSizeOfId()));
}
 
Example 11
Source File: DictionaryGetterUtil.java    From kylin with Apache License 2.0
public static Map<TblColRef, Dictionary<String>> getDictionaryMap(CubeSegment cubeSegment, InputSplit inputSplit,
                                                                  Configuration configuration) throws IOException {
    Map<TblColRef, Dictionary<String>> dictionaryMap = cubeSegment.buildDictionaryMap();

    String shrunkenDictPath = configuration.get(BatchConstants.ARG_SHRUNKEN_DICT_PATH);
    if (shrunkenDictPath == null) {
        return dictionaryMap;
    }

    // replace global dictionary with shrunken dictionary if possible
    String inputSplitSignature = getInputSplitSignature(cubeSegment, inputSplit);
    FileSystem fs = FileSystem.get(configuration);
    ShrunkenDictionary.StringValueSerializer valueSerializer = new ShrunkenDictionary.StringValueSerializer();
    for (TblColRef colRef : cubeSegment.getCubeDesc().getAllGlobalDictColumns()) {
        Path colShrunkenDictDir = new Path(shrunkenDictPath, colRef.getIdentity());
        Path colShrunkenDictPath = new Path(colShrunkenDictDir, inputSplitSignature);
        if (!fs.exists(colShrunkenDictPath)) {
            logger.warn("Shrunken dictionary for column " + colRef.getIdentity() + " in split "
                    + inputSplitSignature + " does not exist!!!");
            continue;
        }
        try (DataInputStream dis = fs.open(colShrunkenDictPath)) {
            Dictionary<String> shrunkenDict = new ShrunkenDictionary(valueSerializer);
            shrunkenDict.readFields(dis);

            dictionaryMap.put(colRef, shrunkenDict);
        }
    }

    return dictionaryMap;
}
 
Example 12
Source File: UHCDictionaryReducer.java    From kylin with Apache License 2.0
private void outputDict(TblColRef col, Dictionary<String> dict) throws IOException, InterruptedException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        mos.write(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
    mos.close();
}
 
Example 13
Source File: FactDistinctColumnsBase.java    From kylin with Apache License 2.0
private void outputDict(TblColRef col, Dictionary<String> dict, Visitor visitor) throws IOException {
    // output written to baseDir/colName/colName.rldict-r-00000 (etc)
    String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;

    try (ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        visitor.collect(BatchConstants.CFG_OUTPUT_DICT, NullWritable.get(),
                new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
    }
}
 
Example 14
Source File: CubeSegment.java    From kylin with Apache License 2.0
public String removeDictResPath(TblColRef col) {
    String dictKey = col.getIdentity();
    return getDictionaries().remove(dictKey);
}
 
Example 15
Source File: UHCDictionaryJob.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_INPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

        //add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        attachCubeMetadata(cube, job.getConfiguration());

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();

        //Note! handle uhc columns is null.
        boolean hasUHCValue = false;
        for (TblColRef tblColRef : uhcColumns) {
            Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
            if (HadoopUtil.getFileSystem(path).exists(path)) {
                FileInputFormat.addInputPath(job, path);
                FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
                hasUHCValue = true;
            }
        }

        if (!hasUHCValue) {
            isSkipped = true;
            return 0;
        }

        setJobClasspath(job, cube.getConfig());
        setupMapper();
        setupReducer(output, reducerCount);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
        job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR, KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
        job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");

        //8G memory is enough for all global dict, because the input is sequential and we handle global dict slice by slice
        job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
        job.getConfiguration().set("mapred.reduce.child.java.opts", "-Xmx8g");
        //Copying global dict to working dir in GlobalDictHDFSStore maybe elapsed a long time (Maybe we could improve it)
        //Waiting the global dict lock maybe also take a long time.
        //So we set 8 hours here
        job.getConfiguration().set("mapreduce.task.timeout", "28800000");

        //allow user specially set config for uhc step
        for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
 
Example 16
Source File: SparkUHCDictionary.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
 
Example 17
Source File: SparkBuildDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
public IReadableTable getDistinctValuesFor(TblColRef col) {
    return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
}
 
Example 18
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
 
Example 19
Source File: CubeSegment.java    From kylin-on-parquet-v2 with Apache License 2.0
public void putDictResPath(TblColRef col, String dictResPath) {
    String dictKey = col.getIdentity();
    getDictionaries().put(dictKey, dictResPath);
}
 
Example 20
Source File: SparkBuildDictionary.java    From kylin with Apache License 2.0
public IReadableTable getDistinctValuesFor(TblColRef col) {
    return new SortedColumnDFSFile(factColumnsInputPath + "/" + col.getIdentity(), col.getType());
}