org.apache.kylin.dict.DictionaryGenerator Java Examples

Source File: MergeCuboidMapperTest.java From Kylin with Apache License 2.0

6 votes

/**
 * Builds a small two-entry dictionary described by a fabricated table signature, hands it to
 * {@code dictionaryManager.trySaveNewDict} and dumps it to standard output.
 *
 * @return the {@link DictionaryInfo} that was passed along with the dictionary
 * @throws IOException declared for the dictionary build / save calls
 */
private DictionaryInfo makeSharedDict() throws IOException {
    // The signature values are made up; the path is just a label.
    TableSignature fakeSignature = new TableSignature();
    fakeSignature.setSize(100);
    fakeSignature.setLastModifiedTime(System.currentTimeMillis());
    fakeSignature.setPath("fake_common_dict");
    DictionaryInfo sharedInfo = new DictionaryInfo("", "", 0, "string", fakeSignature, "");

    // Two three-byte entries: {101,101,101} and {102,102,102} ("eee" and "fff" in ASCII).
    List<byte[]> rawValues = new ArrayList<byte[]>();
    for (byte letter : new byte[] { 101, 102 }) {
        rawValues.add(new byte[] { letter, letter, letter });
    }

    Dictionary<?> sharedDict = DictionaryGenerator.buildDictionaryFromValueList(sharedInfo, rawValues);
    dictionaryManager.trySaveNewDict(sharedDict, sharedInfo);
    ((TrieDictionary) sharedDict).dump(System.out);
    return sharedInfo;
}

Source File: FragmentFilesMerger.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

/**
 * For every entry of {@code parsedCubeInfo.dimensionsUseDictEncoding}, builds a single dictionary from
 * the dictionaries listed for that dimension in {@code dimDictListMap}, serializes it to
 * {@code fragmentOut}, and records its name, class, start offset and length.
 *
 * @param fragmentMetaInfo receives the list of per-dimension dictionary meta infos
 * @param dimDictListMap   source dictionaries, keyed by dimension
 * @param fragmentOut      stream the merged dictionaries are written to; its byte counter supplies the
 *                         offsets and lengths
 * @return the merged dictionary of each processed dimension
 * @throws IOException declared for the build / serialize calls
 */
private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> merged = Maps.newHashMap();
    List<DimDictionaryMetaInfo> metaInfos = Lists.newArrayList();
    for (TblColRef dim : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> sources = dimDictListMap.get(dim);
        Dictionary<String> dimDict = DictionaryGenerator.buildDictionary(dim.getType(),
                new MultipleDictionaryValueEnumerator(dim.getType(), sources));
        merged.put(dim, dimDict);

        DimDictionaryMetaInfo meta = new DimDictionaryMetaInfo();
        meta.setDimName(dim.getName());
        meta.setDictType(dimDict.getClass().getName());

        // Sample the counter before writing; the length is the counter delta after writing.
        meta.setStartOffset((int) fragmentOut.getCount());
        DictionarySerializer.serialize(dimDict, fragmentOut);
        int endOffset = (int) fragmentOut.getCount();
        meta.setDictLength(endOffset - meta.getStartOffset());

        metaInfos.add(meta);
    }
    fragmentMetaInfo.setDimDictionaryMetaInfos(metaInfos);
    return merged;
}

Source File: ColumnarMemoryStorePersister.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

/**
 * Builds a dictionary for {@code dim} from the distinct elements of {@code inputValues} and, when debug
 * logging is enabled, logs how long the build took.
 *
 * @param dim         column whose type and name are used for the build and the log message
 * @param inputValues raw values; each element is cast to {@code String}
 * @return the dictionary produced by {@code DictionaryGenerator.buildDictionary}
 * @throws IOException declared for the dictionary build call
 */
private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch timer = new Stopwatch();
    timer.start();

    // Plain cast; a non-String element fails with ClassCastException when it is visited.
    final Function<Object, String> castToString = new Function<Object, String>() {
        @Nullable
        @Override
        public String apply(Object input) {
            return (String) input;
        }
    };
    // De-duplicate through a hash set first, then expose the elements as Strings.
    final Collection<String> distinctValues = Collections2.transform(Sets.newHashSet(inputValues),
            castToString);

    final Dictionary<String> built = DictionaryGenerator.buildDictionary(dim.getType(),
            new IterableDictionaryValueEnumerator(distinctValues));
    timer.stop();

    if (logger.isDebugEnabled()) {
        logger.debug("BuildDictionary for column : " + dim.getName() + " took : " + timer.elapsedMillis()
                + " ms ");
    }
    return built;
}

Source File: ColumnarMemoryStorePersister.java From kylin with Apache License 2.0

6 votes

/**
 * Builds a dictionary for {@code dim} from the distinct elements of {@code inputValues} and, when debug
 * logging is enabled, logs how long the build took.
 *
 * @param dim         column whose type and name are used for the build and the log message
 * @param inputValues raw values; each element is cast to {@code String}
 * @return the dictionary produced by {@code DictionaryGenerator.buildDictionary}
 * @throws IOException declared for the dictionary build call
 */
private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    timer.start();

    // Plain cast; a non-String element fails with ClassCastException when it is visited.
    final Function<Object, String> castToString = new Function<Object, String>() {
        @Nullable
        @Override
        public String apply(Object input) {
            return (String) input;
        }
    };
    // De-duplicate through a hash set first, then expose the elements as Strings.
    final Collection<String> distinctValues = Collections2.transform(Sets.newHashSet(inputValues),
            castToString);

    final Dictionary<String> built = DictionaryGenerator.buildDictionary(dim.getType(),
            new IterableDictionaryValueEnumerator(distinctValues));
    timer.stop();

    if (logger.isDebugEnabled()) {
        logger.debug("BuildDictionary for column : " + dim.getName() + " took : " + timer.elapsed(MILLISECONDS)
                + " ms ");
    }
    return built;
}

Source File: FragmentFilesMerger.java From kylin with Apache License 2.0

6 votes

/**
 * For every entry of {@code parsedCubeInfo.dimensionsUseDictEncoding}, builds a single dictionary from
 * the dictionaries listed for that dimension in {@code dimDictListMap}, serializes it to
 * {@code fragmentOut}, and records its name, class, start offset and length.
 *
 * @param fragmentMetaInfo receives the list of per-dimension dictionary meta infos
 * @param dimDictListMap   source dictionaries, keyed by dimension
 * @param fragmentOut      stream the merged dictionaries are written to; its byte counter supplies the
 *                         offsets and lengths
 * @return the merged dictionary of each processed dimension
 * @throws IOException declared for the build / serialize calls
 */
private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> merged = Maps.newHashMap();
    List<DimDictionaryMetaInfo> metaInfos = Lists.newArrayList();
    for (TblColRef dim : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> sources = dimDictListMap.get(dim);
        Dictionary<String> dimDict = DictionaryGenerator.buildDictionary(dim.getType(),
                new MultipleDictionaryValueEnumerator(dim.getType(), sources));
        merged.put(dim, dimDict);

        DimDictionaryMetaInfo meta = new DimDictionaryMetaInfo();
        meta.setDimName(dim.getName());
        meta.setDictType(dimDict.getClass().getName());

        // Sample the counter before writing; the length is the counter delta after writing.
        meta.setStartOffset((int) fragmentOut.getCount());
        DictionarySerializer.serialize(dimDict, fragmentOut);
        int endOffset = (int) fragmentOut.getCount();
        meta.setDictLength(endOffset - meta.getStartOffset());

        metaInfos.add(meta);
    }
    fragmentMetaInfo.setDimDictionaryMetaInfos(metaInfos);
    return merged;
}

Source File: UHCDictionaryReducer.java From kylin-on-parquet-v2 with Apache License 2.0

5 votes

/**
 * Prepares this reducer: binds the job configuration, creates the multiple-outputs writer, selects the
 * UHC column at the index given by this task's id, and initializes the dictionary builder for it.
 *
 * @param context task context supplying the configuration and the task attempt id
 * @throws IOException declared for the metadata loading / builder initialization calls
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration jobConf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String targetCube = jobConf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(kylinConfig).getCube(targetCube);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    // The task id is used directly as the index into the UHC column list.
    int reducerIndex = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(reducerIndex);
    logger.info("column name: " + col.getIdentity());

    boolean shardByColumn = cube.getDescriptor().getShardByColumns().contains(col);
    if (shardByColumn) {
        // Shard-by columns: builder chosen by column type, initialized without dictionary info or base dir.
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
        return;
    }

    // Remaining (global dictionary) columns: builder class comes from the cube descriptor.
    String globalDictBaseDir = jobConf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
    DictionaryInfo globalDictInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
    String builderClassName = cubeDesc.getDictionaryBuilderClass(col);
    builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClassName);
    builder.init(globalDictInfo, 0, globalDictBaseDir);
}

Source File: CubingUtils.java From kylin-on-parquet-v2 with Apache License 2.0

5 votes

/**
 * Builds one dictionary per dimension column from the given rows.
 *
 * <p>The i-th cell of every row is attributed to the i-th column returned by
 * {@code listDimensionColumnsExcludingDerived(true)}; cells at positions beyond that list are ignored.
 * Values are collected per column without duplicates, and only columns that received at least one
 * value appear in the result.
 *
 * @param cubeInstance cube whose descriptor supplies the column list
 * @param recordList   rows of cell values
 * @return dictionary per column that had values
 * @throws IOException declared for the dictionary build call
 */
public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> dictColumns = cubeInstance.getDescriptor()
            .listDimensionColumnsExcludingDerived(true);

    // Cell position -> column.
    final HashMap<Integer, TblColRef> columnByPosition = Maps.newHashMap();
    int position = 0;
    for (TblColRef column : dictColumns) {
        columnByPosition.put(position, column);
        position++;
    }

    // Gather the distinct values seen for each mapped column.
    HashMultimap<TblColRef, String> distinctValues = HashMultimap.create();
    for (List<String> row : recordList) {
        for (int cellIndex = 0; cellIndex < row.size(); cellIndex++) {
            String cell = row.get(cellIndex);
            if (!columnByPosition.containsKey(cellIndex)) {
                continue;
            }
            distinctValues.put(columnByPosition.get(cellIndex), cell);
        }
    }

    HashMap<TblColRef, Dictionary<String>> dictByColumn = Maps.newHashMap();
    for (TblColRef column : distinctValues.keySet()) {
        Set<String> columnValues = distinctValues.get(column);
        dictByColumn.put(column, DictionaryGenerator.buildDictionary(column.getType(),
                new IterableDictionaryValueEnumerator(columnValues)));
    }
    return dictByColumn;
}

Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0

5 votes

/**
 * Builds the dictionary for one UHC column and returns it in serialized form together with the name of
 * the file it belongs to.
 *
 * <p>The column is looked up in {@code uhcColumns} by the integer key of {@code columnValues}; every
 * string in the value list is added to the dictionary builder. The serialized form is the dictionary's
 * class name (written with {@code writeUTF}) followed by whatever {@code dict.write} emits.
 *
 * @param columnValues pair of (index into {@code uhcColumns}, values for that column)
 * @return a pair keyed by {@code BatchConstants.CFG_OUTPUT_DICT}; its value holds a {@code NullWritable},
 *         the serialized dictionary bytes and the dictionary file name
 * @throws Exception anything raised by initialization, dictionary building or serialization
 */
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    // One-time initialization, re-checked under the class lock.
    // NOTE(review): the unsynchronized first read is only safe if `initialized` is declared volatile;
    // the field declaration is outside this snippet -- confirm.
    if (!initialized) {
        synchronized (SparkFactDistinct.class) {
            if (!initialized) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());
        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }
        for (String value : columnValues._2) {
            builder.addValue(value);
        }
        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        // Class name first, then the dictionary payload.
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        // Explicit type arguments instead of a raw Tuple3, so the result matches the declared return
        // type without an unchecked conversion.
        Tuple3<Writable, Writable, String> dictOutput = new Tuple3<Writable, Writable, String>(
                NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, dictOutput);
    }
}

Source File: UHCDictionaryReducer.java From kylin with Apache License 2.0

5 votes

/**
 * Prepares this reducer: binds the job configuration, creates the multiple-outputs writer, selects the
 * UHC column at the index given by this task's id, and initializes the dictionary builder for it.
 *
 * @param context task context supplying the configuration and the task attempt id
 * @throws IOException declared for the metadata loading / builder initialization calls
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration jobConf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String targetCube = jobConf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(kylinConfig).getCube(targetCube);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    // The task id is used directly as the index into the UHC column list.
    int reducerIndex = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(reducerIndex);
    logger.info("column name: " + col.getIdentity());

    boolean shardByColumn = cube.getDescriptor().getShardByColumns().contains(col);
    if (shardByColumn) {
        // Shard-by columns: builder chosen by column type, initialized without dictionary info or base dir.
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
        return;
    }

    // Remaining (global dictionary) columns: builder class comes from the cube descriptor.
    String globalDictBaseDir = jobConf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
    DictionaryInfo globalDictInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
    String builderClassName = cubeDesc.getDictionaryBuilderClass(col);
    builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClassName);
    builder.init(globalDictInfo, 0, globalDictBaseDir);
}

Source File: FactDistinctColumnsBase.java From kylin with Apache License 2.0

5 votes

/**
 * Configures this instance for the reducer identified by {@code taskId}.
 *
 * <p>Depending on what the reducer mapping reports for the task, the instance is set up either for
 * statistics handling or for a single column. In the column case, an in-reducer dictionary builder is
 * created only if the feature is enabled in the configuration, the column has no custom dictionary
 * builder class, and the column is not spread over more than one reducer.
 *
 * @param taskId reducer number
 * @throws IOException declared for the cube lookup / builder initialization calls
 */
public void setupReduce(int taskId) throws IOException {
    this.taskId = taskId;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(envConfig)) {
        cube = CubeManager.getInstance(envConfig).getCube(cubeName);
        cubeDesc = cube.getDescriptor();
        reducerMapping = new FactDistinctColumnsReducerMapping(cube);
        logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

        if (!reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // Column reducer.
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            // All three conditions are evaluated; the builder is used only when every one allows it.
            boolean enabledInConfig = envConfig.isBuildDictInReducerEnabled();
            boolean hasCustomBuilder = cubeDesc.getDictionaryBuilderClass(col) != null;
            boolean sharedAcrossReducers = reducerMapping.getReducerNumForDimCol(col) > 1;
            buildDictInReducer = enabledInConfig && !hasCustomBuilder && !sharedAcrossReducers;

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
        } else {
            // Statistics (HLL) reducer.
            isStatistics = true;
            baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();
            logger.info("Reducer " + taskId + " handling stats");
        }
    }
}

Source File: SparkUHCDictionary.java From kylin with Apache License 2.0

5 votes

/**
 * Builds the dictionary for one UHC column and returns it in serialized form together with the name of
 * the file it belongs to.
 *
 * <p>The column is looked up in {@code uhcColumns} by the integer key of {@code columnValues}; every
 * string in the value list is added to the dictionary builder. The serialized form is the dictionary's
 * class name (written with {@code writeUTF}) followed by whatever {@code dict.write} emits.
 *
 * @param columnValues pair of (index into {@code uhcColumns}, values for that column)
 * @return a pair keyed by {@code BatchConstants.CFG_OUTPUT_DICT}; its value holds a {@code NullWritable},
 *         the serialized dictionary bytes and the dictionary file name
 * @throws Exception anything raised by initialization, dictionary building or serialization
 */
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    // One-time initialization, re-checked under the class lock.
    // NOTE(review): the unsynchronized first read is only safe if `initialized` is declared volatile;
    // the field declaration is outside this snippet -- confirm.
    if (!initialized) {
        synchronized (SparkFactDistinct.class) {
            if (!initialized) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());
        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }
        for (String value : columnValues._2) {
            builder.addValue(value);
        }
        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        // Class name first, then the dictionary payload.
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);

        // Explicit type arguments instead of a raw Tuple3, so the result matches the declared return
        // type without an unchecked conversion.
        Tuple3<Writable, Writable, String> dictOutput = new Tuple3<Writable, Writable, String>(
                NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, dictOutput);
    }
}

Source File: CubingUtils.java From kylin with Apache License 2.0

5 votes

/**
 * Builds one dictionary per dimension column from the given rows.
 *
 * <p>The i-th cell of every row is attributed to the i-th column returned by
 * {@code listDimensionColumnsExcludingDerived(true)}; cells at positions beyond that list are ignored.
 * Values are collected per column without duplicates, and only columns that received at least one
 * value appear in the result.
 *
 * @param cubeInstance cube whose descriptor supplies the column list
 * @param recordList   rows of cell values
 * @return dictionary per column that had values
 * @throws IOException declared for the dictionary build call
 */
public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> dictColumns = cubeInstance.getDescriptor()
            .listDimensionColumnsExcludingDerived(true);

    // Cell position -> column.
    final HashMap<Integer, TblColRef> columnByPosition = Maps.newHashMap();
    int position = 0;
    for (TblColRef column : dictColumns) {
        columnByPosition.put(position, column);
        position++;
    }

    // Gather the distinct values seen for each mapped column.
    HashMultimap<TblColRef, String> distinctValues = HashMultimap.create();
    for (List<String> row : recordList) {
        for (int cellIndex = 0; cellIndex < row.size(); cellIndex++) {
            String cell = row.get(cellIndex);
            if (!columnByPosition.containsKey(cellIndex)) {
                continue;
            }
            distinctValues.put(columnByPosition.get(cellIndex), cell);
        }
    }

    HashMap<TblColRef, Dictionary<String>> dictByColumn = Maps.newHashMap();
    for (TblColRef column : distinctValues.keySet()) {
        Set<String> columnValues = distinctValues.get(column);
        dictByColumn.put(column, DictionaryGenerator.buildDictionary(column.getType(),
                new IterableDictionaryValueEnumerator(columnValues)));
    }
    return dictByColumn;
}

Source File: MergeCuboidMapperTest.java From Kylin with Apache License 2.0

4 votes

/**
 * Test fixture: creates the test metadata, clears the manager caches, removes {@code ../job/meta},
 * creates the map driver, resolves the three column references used by the test, and registers
 * dictionaries on every segment of the test cube.
 *
 * <p>Each segment gets its own two-value dictionary for {@code lfn}; {@code lsi} and {@code ssc} both
 * point at the dictionary produced by {@code makeSharedDict()}.
 *
 * @throws Exception anything raised while preparing the fixture
 */
@Before
public void setUp() throws Exception {
    createTestMetadata();
    logger.info("The metadataUrl is : " + getTestConfig());

    MetadataManager.clearCache();
    CubeManager.clearCache();
    ProjectManager.clearCache();
    DictionaryManager.clearCache();

    // hack for distributed cache: wipe the local job metadata directory so the mapper created below
    // gets the latest cube manager state
    FileUtils.deleteDirectory(new File("../job/meta"));

    mapDriver = MapDriver.newMapDriver(new MergeCuboidMapper());

    cubeManager = CubeManager.getInstance(getTestConfig());
    cube = cubeManager.getCube("test_kylin_cube_without_slr_left_join_ready_2_segments");
    dictionaryManager = DictionaryManager.getInstance(getTestConfig());
    lfn = cube.getDescriptor().findColumnRef("DEFAULT.TEST_KYLIN_FACT", "LSTG_FORMAT_NAME");
    lsi = cube.getDescriptor().findColumnRef("DEFAULT.TEST_KYLIN_FACT", "CAL_DT");
    ssc = cube.getDescriptor().findColumnRef("DEFAULT.TEST_CATEGORY_GROUPINGS", "META_CATEG_NAME");

    DictionaryInfo sharedDictInfo = makeSharedDict();

    boolean firstSegment = true;
    for (CubeSegment segment : cube.getSegments()) {
        TableSignature fakeSignature = new TableSignature();
        fakeSignature.setSize(100);
        fakeSignature.setLastModifiedTime(System.currentTimeMillis());
        fakeSignature.setPath("fake_dict_for" + lfn.getName() + segment.getName());

        DictionaryInfo segmentDictInfo = new DictionaryInfo(lfn.getTable(), lfn.getColumn().getName(),
                lfn.getColumn().getZeroBasedIndex(), "string", fakeSignature, "");

        // Every segment has {97,97,97}; the first one adds {99,99,99}, the others {98,98,98}
        // ("aaa", "ccc", "bbb" in ASCII).
        byte secondLetter = firstSegment ? (byte) 99 : (byte) 98;
        List<byte[]> rawValues = new ArrayList<byte[]>();
        rawValues.add(new byte[] { 97, 97, 97 });
        rawValues.add(new byte[] { secondLetter, secondLetter, secondLetter });

        Dictionary<?> segmentDict = DictionaryGenerator.buildDictionaryFromValueList(segmentDictInfo, rawValues);
        dictionaryManager.trySaveNewDict(segmentDict, segmentDictInfo);
        ((TrieDictionary) segmentDict).dump(System.out);

        segment.putDictResPath(lfn, segmentDictInfo.getResourcePath());
        segment.putDictResPath(lsi, sharedDictInfo.getResourcePath());
        segment.putDictResPath(ssc, sharedDictInfo.getResourcePath());

        cubeManager.updateCube(cube);

        firstSegment = false;
    }
}

Source File: SparkFactDistinct.java From kylin with Apache License 2.0

4 votes

/**
 * One-time setup for the current partition: loads the Kylin configuration, resolves the cube and its
 * reducer mapping, and prepares this instance either for statistics handling or for a single column.
 *
 * <p>In the column case, an in-reducer dictionary builder is created only if the feature is enabled in
 * the configuration, the column has no custom dictionary builder class, and the column is not spread
 * over more than one reducer. {@code initialized} is set as the last step.
 *
 * @throws IOException declared for the configuration loading / builder initialization calls
 */
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance targetCube = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = targetCube.getDescriptor();
        cubeConfig = targetCube.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(targetCube);

        result = Lists.newArrayList();

        if (!reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // Column partition.
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // All three conditions are evaluated; the builder is used only when every one allows it.
            boolean enabledInConfig = kConfig.isBuildDictInReducerEnabled();
            boolean hasCustomBuilder = cubeDesc.getDictionaryBuilderClass(col) != null;
            boolean sharedAcrossReducers = reducerMapping.getReducerNumForDimCol(col) > 1;
            buildDictInReducer = enabledInConfig && !hasCustomBuilder && !sharedAcrossReducers;

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        } else {
            // Statistics (HLL) partition.
            isStatistics = true;
            baseCuboidId = targetCube.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        }

        initialized = true;
    }
}

Source File: MergeDictReducer.java From kylin with Apache License 2.0

4 votes

/**
 * Merges all serialized dictionaries received for one column and emits the resulting
 * {@code DictionaryInfo} in serialized form under the same key.
 *
 * <p>If the column is not present in {@code colNeedDictMap}, a warning is logged and nothing is
 * emitted. With exactly one incoming dictionary it is used as is; with several, a new dictionary is
 * built from all of them.
 *
 * @param key     column identifier, used to look up the column in {@code colNeedDictMap}
 * @param values  serialized dictionaries for that column
 * @param context receives the (key, serialized dictionary info) output
 * @throws IOException              declared for the (de)serialization and output calls
 * @throws InterruptedException     declared for the output call
 * @throws IllegalArgumentException if no dictionary was received, or merging produced {@code null}
 */
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        // Text.getBytes() exposes the backing buffer, which is only valid up to getLength() and may
        // carry stale trailing bytes; copyBytes() returns exactly the valid bytes.
        ByteArray byteArray = new ByteArray(value.copyBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }

    Dictionary<String> mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    // Serialize the dictionary info into memory and emit it as the value.
    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}

Source File: FactDistinctColumnsReducer.java From kylin with Apache License 2.0

4 votes

/**
 * Prepares this reducer: binds the job configuration, creates the multiple-outputs writer, resolves the
 * cube and its reducer mapping, and sets the instance up either for statistics handling or for a
 * single column.
 *
 * <p>In the column case, an in-reducer dictionary builder is created only if the feature is enabled in
 * the configuration, the column has no custom dictionary builder class, and the column is not spread
 * over more than one reducer.
 *
 * @param context task context supplying the configuration and the task attempt id
 * @throws IOException declared for the metadata loading / builder initialization calls
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration jobConf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String targetCube = jobConf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(kylinConfig).getCube(targetCube);
    cubeConfig = cube.getConfig();
    cubeDesc = cube.getDescriptor();

    taskId = context.getTaskAttemptID().getTaskID().getId();

    reducerMapping = new FactDistinctColumnsReducerMapping(cube);

    logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
        // Statistics (HLL) reducer.
        isStatistics = true;
        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        baseCuboidRowCountInMappers = Lists.newArrayList();
        cuboidHLLMap = Maps.newHashMap();
        String samplingSetting = context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT);
        samplingPercentage = Integer.parseInt(samplingSetting);
        logger.info("Reducer " + taskId + " handling stats");
        return;
    }

    // Column reducer.
    col = reducerMapping.getColForReducer(taskId);
    Preconditions.checkNotNull(col);

    // All three conditions are evaluated; the builder is used only when every one allows it.
    boolean enabledInConfig = kylinConfig.isBuildDictInReducerEnabled();
    boolean hasCustomBuilder = cubeDesc.getDictionaryBuilderClass(col) != null;
    boolean sharedAcrossReducers = reducerMapping.getReducerNumForDimCol(col) > 1;
    buildDictInReducer = enabledInConfig && !hasCustomBuilder && !sharedAcrossReducers;

    if (buildDictInReducer) {
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    }
    logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
}

Source File: SparkFactDistinct.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * One-time setup for the current partition: loads the Kylin configuration, resolves the cube and its
 * reducer mapping, and prepares this instance either for statistics handling or for a single column.
 *
 * <p>In the column case, an in-reducer dictionary builder is created only if the feature is enabled in
 * the configuration, the column has no custom dictionary builder class, and the column is not spread
 * over more than one reducer. {@code initialized} is set as the last step.
 *
 * @throws IOException declared for the configuration loading / builder initialization calls
 */
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance targetCube = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = targetCube.getDescriptor();
        cubeConfig = targetCube.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(targetCube);

        result = Lists.newArrayList();

        if (!reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // Column partition.
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // All three conditions are evaluated; the builder is used only when every one allows it.
            boolean enabledInConfig = kConfig.isBuildDictInReducerEnabled();
            boolean hasCustomBuilder = cubeDesc.getDictionaryBuilderClass(col) != null;
            boolean sharedAcrossReducers = reducerMapping.getReducerNumForDimCol(col) > 1;
            buildDictInReducer = enabledInConfig && !hasCustomBuilder && !sharedAcrossReducers;

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        } else {
            // Statistics (HLL) partition.
            isStatistics = true;
            baseCuboidId = targetCube.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        }

        initialized = true;
    }
}

Source File: MergeDictReducer.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * Merges all serialized dictionaries received for one column and emits the resulting
 * {@code DictionaryInfo} in serialized form under the same key.
 *
 * <p>If the column is not present in {@code colNeedDictMap}, a warning is logged and nothing is
 * emitted. With exactly one incoming dictionary it is used as is; with several, a new dictionary is
 * built from all of them.
 *
 * @param key     column identifier, used to look up the column in {@code colNeedDictMap}
 * @param values  serialized dictionaries for that column
 * @param context receives the (key, serialized dictionary info) output
 * @throws IOException              declared for the (de)serialization and output calls
 * @throws InterruptedException     declared for the output call
 * @throws IllegalArgumentException if no dictionary was received, or merging produced {@code null}
 */
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        // Text.getBytes() exposes the backing buffer, which is only valid up to getLength() and may
        // carry stale trailing bytes; copyBytes() returns exactly the valid bytes.
        ByteArray byteArray = new ByteArray(value.copyBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }

    Dictionary<String> mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    // Serialize the dictionary info into memory and emit it as the value.
    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}

Source File: FactDistinctColumnsReducer.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * Prepares this reducer: binds the job configuration, creates the multiple-outputs writer, resolves the
 * cube and its reducer mapping, and sets the instance up either for statistics handling or for a
 * single column.
 *
 * <p>In the column case, an in-reducer dictionary builder is created only if the feature is enabled in
 * the configuration, the column has no custom dictionary builder class, and the column is not spread
 * over more than one reducer.
 *
 * @param context task context supplying the configuration and the task attempt id
 * @throws IOException declared for the metadata loading / builder initialization calls
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration jobConf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig kylinConfig = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String targetCube = jobConf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(kylinConfig).getCube(targetCube);
    cubeConfig = cube.getConfig();
    cubeDesc = cube.getDescriptor();

    taskId = context.getTaskAttemptID().getTaskID().getId();

    reducerMapping = new FactDistinctColumnsReducerMapping(cube);

    logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
        // Statistics (HLL) reducer.
        isStatistics = true;
        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        baseCuboidRowCountInMappers = Lists.newArrayList();
        cuboidHLLMap = Maps.newHashMap();
        String samplingSetting = context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT);
        samplingPercentage = Integer.parseInt(samplingSetting);
        logger.info("Reducer " + taskId + " handling stats");
        return;
    }

    // Column reducer.
    col = reducerMapping.getColForReducer(taskId);
    Preconditions.checkNotNull(col);

    // All three conditions are evaluated; the builder is used only when every one allows it.
    boolean enabledInConfig = kylinConfig.isBuildDictInReducerEnabled();
    boolean hasCustomBuilder = cubeDesc.getDictionaryBuilderClass(col) != null;
    boolean sharedAcrossReducers = reducerMapping.getReducerNumForDimCol(col) > 1;
    buildDictInReducer = enabledInConfig && !hasCustomBuilder && !sharedAcrossReducers;

    if (buildDictInReducer) {
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    }
    logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
}

org.apache.kylin.dict.DictionaryGenerator Java Examples