org.apache.kylin.dict.DictionaryGenerator Java Examples

The following examples show how to use org.apache.kylin.dict.DictionaryGenerator. They are taken from open-source projects; the source-file line above each example names the file and the project it comes from.
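
At their core, most of these examples follow one pattern: collect the distinct values of a column, wrap them in a value enumerator, and let DictionaryGenerator.buildDictionary choose and build a suitable dictionary for the column's data type. Here is a minimal, self-contained sketch of that pattern; the import paths and the DataType.getType(...) lookup are assumptions based on common Kylin versions, so verify them against your build.

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.kylin.common.util.Dictionary;                  // assumed location; verify against your Kylin version
import org.apache.kylin.dict.DictionaryGenerator;
import org.apache.kylin.dict.IterableDictionaryValueEnumerator;
import org.apache.kylin.metadata.datatype.DataType;              // assumed location; verify against your Kylin version

public class DictionaryGeneratorSketch {
    public static void main(String[] args) throws IOException {
        // Distinct values for one column; in the examples below these come
        // from cube segments, MapReduce reducers, or Spark partitions.
        List<String> values = Arrays.asList("CN", "DE", "US");

        // buildDictionary picks a dictionary implementation suited to the data type.
        Dictionary<String> dict = DictionaryGenerator.buildDictionary(
                DataType.getType("varchar(256)"),
                new IterableDictionaryValueEnumerator(values));

        // The dictionary maps each value to a compact integer id and back.
        int id = dict.getIdFromValue("US");
        System.out.println("id(US) = " + id + ", decoded = " + dict.getValueFromId(id));
    }
}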
Example #1
Source File: MergeCuboidMapperTest.java    From Kylin with Apache License 2.0
private DictionaryInfo makeSharedDict() throws IOException {
    TableSignature signature = new TableSignature();
    signature.setSize(100);
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath("fake_common_dict");

    DictionaryInfo newDictInfo = new DictionaryInfo("", "", 0, "string", signature, "");

    List<byte[]> values = new ArrayList<byte[]>();
    values.add(new byte[] { 101, 101, 101 });
    values.add(new byte[] { 102, 102, 102 });
    Dictionary<?> dict = DictionaryGenerator.buildDictionaryFromValueList(newDictInfo, values);
    dictionaryManager.trySaveNewDict(dict, newDictInfo);
    ((TrieDictionary) dict).dump(System.out);

    return newDictInfo;
}
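
Note that buildDictionaryFromValueList consumes raw byte[] values: the arrays { 101, 101, 101 } and { 102, 102, 102 } above are simply the ASCII strings "eee" and "fff", which is what the TrieDictionary dump prints.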
 
Example #2
Source File: FragmentFilesMerger.java    From kylin-on-parquet-v2 with Apache License 2.0
private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> mergedDictMap = Maps.newHashMap();
    List<DimDictionaryMetaInfo> dimDictionaryMetaInfos = Lists.newArrayList();
    for (TblColRef dimension : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> dicts = dimDictListMap.get(dimension);
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dimension.getType(), dicts);
        Dictionary<String> mergedDict = DictionaryGenerator.buildDictionary(dimension.getType(),
                multipleDictionaryValueEnumerator);
        mergedDictMap.put(dimension, mergedDict);

        DimDictionaryMetaInfo dimDictionaryMetaInfo = new DimDictionaryMetaInfo();
        dimDictionaryMetaInfo.setDimName(dimension.getName());
        dimDictionaryMetaInfo.setDictType(mergedDict.getClass().getName());
        dimDictionaryMetaInfo.setStartOffset((int) fragmentOut.getCount());

        DictionarySerializer.serialize(mergedDict, fragmentOut);
        dimDictionaryMetaInfo.setDictLength((int) fragmentOut.getCount() - dimDictionaryMetaInfo.getStartOffset());
        dimDictionaryMetaInfos.add(dimDictionaryMetaInfo);
    }
    fragmentMetaInfo.setDimDictionaryMetaInfos(dimDictionaryMetaInfos);
    return mergedDictMap;
}
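
Stripped of the fragment bookkeeping, the merge above (repeated in Example #5 and in the MergeDictReducer examples below) re-enumerates the values of several existing dictionaries and builds one new dictionary over them. A minimal sketch, where dictA and dictB are hypothetical stand-ins for the per-fragment Dictionary<String> instances of a column with type dataType:

// dictA and dictB are hypothetical existing dictionaries for the same column.
List<Dictionary<String>> dicts = Arrays.asList(dictA, dictB);
Dictionary<String> merged = DictionaryGenerator.buildDictionary(
        dataType, new MultipleDictionaryValueEnumerator(dataType, dicts));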
 
Example #3
Source File: ColumnarMemoryStorePersister.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.start();
    final Collection<String> values = Collections2.transform(Sets.newHashSet(inputValues),
            new Function<Object, String>() {
                @Nullable
                @Override
                public String apply(Object input) {
                    String value = (String) input;
                    return value;
                }
            });
    final Dictionary<String> dict = DictionaryGenerator.buildDictionary(dim.getType(),
            new IterableDictionaryValueEnumerator(values));
    stopwatch.stop();
    if (logger.isDebugEnabled()) {
        logger.debug("BuildDictionary for column : " + dim.getName() + " took : " + stopwatch.elapsedMillis()
                + " ms ");
    }
    return dict;
}
 
Example #4
Source File: ColumnarMemoryStorePersister.java    From kylin with Apache License 2.0
private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch stopwatch = Stopwatch.createUnstarted();
    stopwatch.start();
    final Collection<String> values = Collections2.transform(Sets.newHashSet(inputValues),
            new Function<Object, String>() {
                @Nullable
                @Override
                public String apply(Object input) {
                    String value = (String) input;
                    return value;
                }
            });
    final Dictionary<String> dict = DictionaryGenerator.buildDictionary(dim.getType(),
            new IterableDictionaryValueEnumerator(values));
    stopwatch.stop();
    if (logger.isDebugEnabled()) {
        logger.debug("BuildDictionary for column : " + dim.getName() + " took : " + stopwatch.elapsed(MILLISECONDS)
                + " ms ");
    }
    return dict;
}
 
Example #5
Source File: FragmentFilesMerger.java    From kylin with Apache License 2.0
private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> mergedDictMap = Maps.newHashMap();
    List<DimDictionaryMetaInfo> dimDictionaryMetaInfos = Lists.newArrayList();
    for (TblColRef dimension : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> dicts = dimDictListMap.get(dimension);
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dimension.getType(), dicts);
        Dictionary<String> mergedDict = DictionaryGenerator.buildDictionary(dimension.getType(),
                multipleDictionaryValueEnumerator);
        mergedDictMap.put(dimension, mergedDict);

        DimDictionaryMetaInfo dimDictionaryMetaInfo = new DimDictionaryMetaInfo();
        dimDictionaryMetaInfo.setDimName(dimension.getName());
        dimDictionaryMetaInfo.setDictType(mergedDict.getClass().getName());
        dimDictionaryMetaInfo.setStartOffset((int) fragmentOut.getCount());

        DictionarySerializer.serialize(mergedDict, fragmentOut);
        dimDictionaryMetaInfo.setDictLength((int) fragmentOut.getCount() - dimDictionaryMetaInfo.getStartOffset());
        dimDictionaryMetaInfos.add(dimDictionaryMetaInfo);
    }
    fragmentMetaInfo.setDimDictionaryMetaInfos(dimDictionaryMetaInfos);
    return mergedDictMap;
}
 
Example #6
Source File: UHCDictionaryReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    int taskId = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(taskId);
    logger.info("column name: " + col.getIdentity());

    if (cube.getDescriptor().getShardByColumns().contains(col)) {
        //for ShardByColumns
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    } else {
        //for GlobalDictionaryColumns
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    }
}
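
The branch above only initializes the builder; values are added and the dictionary is built later (Example #8 shows the complete life cycle, including the no-arg build()). Condensed to its essentials, the incremental pattern is:

// init(null, 0, null) = no DictionaryInfo, base id 0, no working directory --
// the arguments the shard-by branch above passes.
IDictionaryBuilder builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
builder.init(null, 0, null);
for (String value : distinctValues) {   // distinctValues is a hypothetical value source
    builder.addValue(value);
}
Dictionary<String> dict = builder.build();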
 
Example #7
Source File: CubingUtils.java    From kylin-on-parquet-v2 with Apache License 2.0
public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> columnsNeedToBuildDictionary = cubeInstance.getDescriptor()
            .listDimensionColumnsExcludingDerived(true);
    final HashMap<Integer, TblColRef> tblColRefMap = Maps.newHashMap();
    int index = 0;
    for (TblColRef column : columnsNeedToBuildDictionary) {
        tblColRefMap.put(index++, column);
    }

    HashMap<TblColRef, Dictionary<String>> result = Maps.newHashMap();

    HashMultimap<TblColRef, String> valueMap = HashMultimap.create();
    for (List<String> row : recordList) {
        for (int i = 0; i < row.size(); i++) {
            String cell = row.get(i);
            if (tblColRefMap.containsKey(i)) {
                valueMap.put(tblColRefMap.get(i), cell);
            }
        }
    }
    for (TblColRef tblColRef : valueMap.keySet()) {
        Set<String> values = valueMap.get(tblColRef);
        Dictionary<String> dict = DictionaryGenerator.buildDictionary(tblColRef.getType(),
                new IterableDictionaryValueEnumerator(values));
        result.put(tblColRef, dict);
    }
    return result;
}
 
Example #8
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    if (initialized == false) {
        synchronized (SparkFactDistinct.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());
        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }
        Iterator<String> values = columnValues._2.iterator();
        while (values.hasNext()) {
            builder.addValue(values.next());
        }
        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);
        Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3);
    }
}
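
The write side above records the dictionary's class name (writeUTF) followed by the dictionary's own serialized form. A plausible read side, assuming Dictionary exposes the Writable-style readFields counterpart to the write(...) call used here, would be:

// bytes is the byte[] produced by baos.toByteArray() above.
DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes));
String dictClassName = in.readUTF();
Dictionary<String> dict = (Dictionary<String>) ClassUtil.newInstance(dictClassName);
dict.readFields(in);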
 
Example #9
Source File: UHCDictionaryReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    int taskId = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(taskId);
    logger.info("column name: " + col.getIdentity());

    if (cube.getDescriptor().getShardByColumns().contains(col)) {
        //for ShardByColumns
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    } else {
        //for GlobalDictionaryColumns
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    }
}
 
Example #10
Source File: FactDistinctColumnsBase.java    From kylin with Apache License 2.0
public void setupReduce(int taskId) throws IOException {
    this.taskId = taskId;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(envConfig)) {
        cube = CubeManager.getInstance(envConfig).getCube(cubeName);
        cubeDesc = cube.getDescriptor();
        reducerMapping = new FactDistinctColumnsReducerMapping(cube);
        logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // hll
            isStatistics = true;
            baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();
            logger.info("Reducer " + taskId + " handling stats");
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            // local build dict
            buildDictInReducer = envConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }
            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }
            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
        }
    }
}
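
Two guards in this setup are worth noting: a custom dictionary builder class on the column, or a dictionary column fanned out across more than one reducer, both force buildDictInReducer to false. As the inline comments say, a local build is only correct when the default builder is used and this reducer sees all values of the column.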
 
Example #11
Source File: SparkUHCDictionary.java    From kylin with Apache License 2.0
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    if (initialized == false) {
        synchronized (SparkFactDistinct.class) {
            if (initialized == false) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());
        if (cube.getDescriptor().getShardByColumns().contains(col)) {
            //for ShardByColumns
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        } else {
            //for GlobalDictionaryColumns
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        }
        Iterator<String> values = columnValues._2.iterator();
        while (values.hasNext()) {
            builder.addValue(values.next());
        }
        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);
        Tuple3 tuple3 = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, tuple3);
    }
}
 
Example #12
Source File: CubingUtils.java    From kylin with Apache License 2.0
public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> columnsNeedToBuildDictionary = cubeInstance.getDescriptor()
            .listDimensionColumnsExcludingDerived(true);
    final HashMap<Integer, TblColRef> tblColRefMap = Maps.newHashMap();
    int index = 0;
    for (TblColRef column : columnsNeedToBuildDictionary) {
        tblColRefMap.put(index++, column);
    }

    HashMap<TblColRef, Dictionary<String>> result = Maps.newHashMap();

    HashMultimap<TblColRef, String> valueMap = HashMultimap.create();
    for (List<String> row : recordList) {
        for (int i = 0; i < row.size(); i++) {
            String cell = row.get(i);
            if (tblColRefMap.containsKey(i)) {
                valueMap.put(tblColRefMap.get(i), cell);
            }
        }
    }
    for (TblColRef tblColRef : valueMap.keySet()) {
        Set<String> values = valueMap.get(tblColRef);
        Dictionary<String> dict = DictionaryGenerator.buildDictionary(tblColRef.getType(),
                new IterableDictionaryValueEnumerator(values));
        result.put(tblColRef, dict);
    }
    return result;
}
 
Example #13
Source File: MergeCuboidMapperTest.java    From Kylin with Apache License 2.0
@Before
public void setUp() throws Exception {

    createTestMetadata();

    logger.info("The metadataUrl is : " + getTestConfig());

    MetadataManager.clearCache();
    CubeManager.clearCache();
    ProjectManager.clearCache();
    DictionaryManager.clearCache();

    // hack for distributed cache
    // CubeManager.removeInstance(KylinConfig.createInstanceFromUri("../job/meta"));
    // to make sure the following mapper gets the latest CubeManager
    FileUtils.deleteDirectory(new File("../job/meta"));

    MergeCuboidMapper mapper = new MergeCuboidMapper();
    mapDriver = MapDriver.newMapDriver(mapper);

    cubeManager = CubeManager.getInstance(getTestConfig());
    cube = cubeManager.getCube("test_kylin_cube_without_slr_left_join_ready_2_segments");
    dictionaryManager = DictionaryManager.getInstance(getTestConfig());
    lfn = cube.getDescriptor().findColumnRef("DEFAULT.TEST_KYLIN_FACT", "LSTG_FORMAT_NAME");
    lsi = cube.getDescriptor().findColumnRef("DEFAULT.TEST_KYLIN_FACT", "CAL_DT");
    ssc = cube.getDescriptor().findColumnRef("DEFAULT.TEST_CATEGORY_GROUPINGS", "META_CATEG_NAME");

    DictionaryInfo sharedDict = makeSharedDict();

    boolean isFirstSegment = true;
    for (CubeSegment segment : cube.getSegments()) {

        TableSignature signature = new TableSignature();
        signature.setSize(100);
        signature.setLastModifiedTime(System.currentTimeMillis());
        signature.setPath("fake_dict_for" + lfn.getName() + segment.getName());

        DictionaryInfo newDictInfo = new DictionaryInfo(lfn.getTable(), lfn.getColumn().getName(), lfn.getColumn().getZeroBasedIndex(), "string", signature, "");

        List<byte[]> values = new ArrayList<byte[]>();
        values.add(new byte[] { 97, 97, 97 });
        if (isFirstSegment)
            values.add(new byte[] { 99, 99, 99 });
        else
            values.add(new byte[] { 98, 98, 98 });
        Dictionary<?> dict = DictionaryGenerator.buildDictionaryFromValueList(newDictInfo, values);
        dictionaryManager.trySaveNewDict(dict, newDictInfo);
        ((TrieDictionary) dict).dump(System.out);

        segment.putDictResPath(lfn, newDictInfo.getResourcePath());
        segment.putDictResPath(lsi, sharedDict.getResourcePath());
        segment.putDictResPath(ssc, sharedDict.getResourcePath());

        // cubeManager.saveResource(segment.getCubeInstance());
        // cubeManager.afterCubeUpdated(segment.getCubeInstance());
        cubeManager.updateCube(cube);

        isFirstSegment = false;
    }

}
 
Example #14
Source File: SparkFactDistinct.java    From kylin with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // hll
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}
 
Example #15
Source File: MergeDictReducer.java    From kylin with Apache License 2.0
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        ByteArray byteArray = new ByteArray(value.getBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }
    Dictionary mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}
 
Example #16
Source File: FactDistinctColumnsReducer.java    From kylin with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeConfig = cube.getConfig();
    cubeDesc = cube.getDescriptor();

    taskId = context.getTaskAttemptID().getTaskID().getId();

    reducerMapping = new FactDistinctColumnsReducerMapping(cube);

    logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
        // hll
        isStatistics = true;
        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        baseCuboidRowCountInMappers = Lists.newArrayList();
        cuboidHLLMap = Maps.newHashMap();
        samplingPercentage = Integer
                .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
        logger.info("Reducer " + taskId + " handling stats");
    } else {
        // normal col
        col = reducerMapping.getColForReducer(taskId);
        Preconditions.checkNotNull(col);

        // local build dict
        buildDictInReducer = config.isBuildDictInReducerEnabled();
        if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
            buildDictInReducer = false;
        }
        if (reducerMapping.getReducerNumForDimCol(col) > 1) {
            buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
        }
        if (buildDictInReducer) {
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        }
        logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
    }
}
 
Example #17
Source File: SparkFactDistinct.java    From kylin-on-parquet-v2 with Apache License 2.0
private void init() throws IOException {
    taskId = TaskContext.getPartitionId();
    kConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(conf, metaUrl);
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(kConfig)) {
        CubeInstance cubeInstance = CubeManager.getInstance(kConfig).getCube(cubeName);
        cubeDesc = cubeInstance.getDescriptor();
        cubeConfig = cubeInstance.getConfig();
        reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);

        result = Lists.newArrayList();

        if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
            // hll
            isStatistics = true;
            baseCuboidId = cubeInstance.getCuboidScheduler().getBaseCuboidId();
            baseCuboidRowCountInMappers = Lists.newArrayList();
            cuboidHLLMap = Maps.newHashMap();

            logger.info("Partition {} handling stats", taskId);
        } else {
            // normal col
            col = reducerMapping.getColForReducer(taskId);
            Preconditions.checkNotNull(col);

            isDimensionCol = cubeDesc.listDimensionColumnsExcludingDerived(true).contains(col) && col.getType().needCompare();
            isDictCol = cubeDesc.getAllColumnsNeedDictionaryBuilt().contains(col);

            // local build dict
            buildDictInReducer = kConfig.isBuildDictInReducerEnabled();
            if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
                buildDictInReducer = false;
            }

            if (reducerMapping.getReducerNumForDimCol(col) > 1) {
                buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
            }

            if (buildDictInReducer) {
                builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
                builder.init(null, 0, null);
            }
            logger.info("Partition {} handling column {}, buildDictInReducer={}", taskId, col, buildDictInReducer);
        }

        initialized = true;
    }
}
 
Example #18
Source File: MergeDictReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        ByteArray byteArray = new ByteArray(value.getBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }
    Dictionary mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}
 
Example #19
Source File: FactDistinctColumnsReducer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    cubeConfig = cube.getConfig();
    cubeDesc = cube.getDescriptor();

    taskId = context.getTaskAttemptID().getTaskID().getId();

    reducerMapping = new FactDistinctColumnsReducerMapping(cube);

    logger.info("reducer no " + taskId + ", role play " + reducerMapping.getRolePlayOfReducer(taskId));

    if (reducerMapping.isCuboidRowCounterReducer(taskId)) {
        // hll
        isStatistics = true;
        baseCuboidId = cube.getCuboidScheduler().getBaseCuboidId();
        baseCuboidRowCountInMappers = Lists.newArrayList();
        cuboidHLLMap = Maps.newHashMap();
        samplingPercentage = Integer
                .parseInt(context.getConfiguration().get(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT));
        logger.info("Reducer " + taskId + " handling stats");
    } else {
        // normal col
        col = reducerMapping.getColForReducer(taskId);
        Preconditions.checkNotNull(col);

        // local build dict
        buildDictInReducer = config.isBuildDictInReducerEnabled();
        if (cubeDesc.getDictionaryBuilderClass(col) != null) { // only works with default dictionary builder
            buildDictInReducer = false;
        }
        if (reducerMapping.getReducerNumForDimCol(col) > 1) {
            buildDictInReducer = false; // only works if this is the only reducer of a dictionary column
        }
        if (buildDictInReducer) {
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        }
        logger.info("Reducer " + taskId + " handling column " + col + ", buildDictInReducer=" + buildDictInReducer);
    }
}