Java Code Examples for org.apache.kylin.dict.DictionaryGenerator#buildDictionary()

The following examples show how to use org.apache.kylin.dict.DictionaryGenerator#buildDictionary(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: FragmentFilesMerger.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

/**
 * Builds one merged dictionary for every dimension listed in
 * {@code parsedCubeInfo.dimensionsUseDictEncoding}, serializes each of them to
 * {@code fragmentOut} and records where it was written.
 *
 * @param fragmentMetaInfo receives the list of per-dimension dictionary meta entries
 * @param dimDictListMap   the dictionaries to merge, looked up by dimension
 * @param fragmentOut      stream the merged dictionaries are serialized to; its
 *                         {@code getCount()} value before and after each write supplies the
 *                         recorded start offset and length
 * @return the merged dictionary of each processed dimension
 * @throws IOException declared by the signature for the build / serialize calls
 */
private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> mergedByDimension = Maps.newHashMap();
    List<DimDictionaryMetaInfo> metaEntries = Lists.newArrayList();

    for (TblColRef dim : parsedCubeInfo.dimensionsUseDictEncoding) {
        // Enumerate the values of all source dictionaries and build a single dictionary from them.
        List<Dictionary<String>> sources = dimDictListMap.get(dim);
        MultipleDictionaryValueEnumerator allValues = new MultipleDictionaryValueEnumerator(dim.getType(),
                sources);
        Dictionary<String> merged = DictionaryGenerator.buildDictionary(dim.getType(), allValues);
        mergedByDimension.put(dim, merged);

        // The start offset must be captured before serializing.
        DimDictionaryMetaInfo meta = new DimDictionaryMetaInfo();
        meta.setDimName(dim.getName());
        meta.setDictType(merged.getClass().getName());
        meta.setStartOffset((int) fragmentOut.getCount());

        DictionarySerializer.serialize(merged, fragmentOut);

        // Length is the stream count after the write minus the recorded start offset.
        int endOffset = (int) fragmentOut.getCount();
        meta.setDictLength(endOffset - meta.getStartOffset());
        metaEntries.add(meta);
    }

    fragmentMetaInfo.setDimDictionaryMetaInfos(metaEntries);
    return mergedByDimension;
}

Example 2

Source File: ColumnarMemoryStorePersister.java From kylin-on-parquet-v2 with Apache License 2.0

6 votes

/**
 * Builds a dictionary for one dimension from the distinct input values and, when debug
 * logging is enabled, reports how long the build took.
 *
 * @param dim         the dimension column; its type is passed to the dictionary generator
 * @param inputValues the column values; duplicates are dropped via a hash set and every
 *                    element is cast to {@code String}
 * @return the dictionary produced by {@code DictionaryGenerator.buildDictionary}
 * @throws IOException declared by the signature for the dictionary build call
 */
private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch timer = new Stopwatch();
    timer.start();

    // Plain cast: the elements are expected to already be strings.
    final Function<Object, String> castToString = new Function<Object, String>() {
        @Nullable
        @Override
        public String apply(Object input) {
            return (String) input;
        }
    };
    final Collection<String> distinctValues = Collections2.transform(Sets.newHashSet(inputValues), castToString);
    final IterableDictionaryValueEnumerator valueEnumerator = new IterableDictionaryValueEnumerator(
            distinctValues);
    final Dictionary<String> built = DictionaryGenerator.buildDictionary(dim.getType(), valueEnumerator);

    timer.stop();
    if (logger.isDebugEnabled()) {
        String message = "BuildDictionary for column : " + dim.getName() + " took : " + timer.elapsedMillis()
                + " ms ";
        logger.debug(message);
    }
    return built;
}

Example 3

Source File: FragmentFilesMerger.java From kylin with Apache License 2.0

6 votes

/**
 * Merges the dictionaries of each dictionary-encoded dimension, writes every merged
 * dictionary to {@code fragmentOut}, and stores the resulting meta entries on
 * {@code fragmentMetaInfo}.
 *
 * @param fragmentMetaInfo target of {@code setDimDictionaryMetaInfos} once all dimensions
 *                         have been processed
 * @param dimDictListMap   per-dimension list of dictionaries to merge
 * @param fragmentOut      counting stream the dictionaries are serialized into; the recorded
 *                         start offset and length are derived from its {@code getCount()}
 * @return a map from dimension to its merged dictionary
 * @throws IOException declared by the signature for the build / serialize calls
 */
private Map<TblColRef, Dictionary<String>> mergeAndPersistDictionaries(FragmentMetaInfo fragmentMetaInfo,
        Map<TblColRef, List<Dictionary<String>>> dimDictListMap, CountingOutputStream fragmentOut)
        throws IOException {
    logger.info("merge dimension dictionaries");
    Map<TblColRef, Dictionary<String>> dictPerDimension = Maps.newHashMap();
    List<DimDictionaryMetaInfo> dictMetaList = Lists.newArrayList();

    for (TblColRef column : parsedCubeInfo.dimensionsUseDictEncoding) {
        List<Dictionary<String>> columnDicts = dimDictListMap.get(column);
        MultipleDictionaryValueEnumerator valueEnumerator = new MultipleDictionaryValueEnumerator(
                column.getType(), columnDicts);
        Dictionary<String> columnDict = DictionaryGenerator.buildDictionary(column.getType(), valueEnumerator);
        dictPerDimension.put(column, columnDict);

        // Describe the dictionary; the start offset is the stream count before it is written.
        DimDictionaryMetaInfo dictMeta = new DimDictionaryMetaInfo();
        dictMeta.setDimName(column.getName());
        dictMeta.setDictType(columnDict.getClass().getName());
        dictMeta.setStartOffset((int) fragmentOut.getCount());

        DictionarySerializer.serialize(columnDict, fragmentOut);

        // Stream count after the write, minus the start offset, is the serialized length.
        int countAfterWrite = (int) fragmentOut.getCount();
        dictMeta.setDictLength(countAfterWrite - dictMeta.getStartOffset());
        dictMetaList.add(dictMeta);
    }

    fragmentMetaInfo.setDimDictionaryMetaInfos(dictMetaList);
    return dictPerDimension;
}

Example 4

Source File: ColumnarMemoryStorePersister.java From kylin with Apache License 2.0

6 votes

/**
 * Builds a dictionary for one dimension from the distinct input values and, when debug
 * logging is enabled, logs the elapsed build time in milliseconds.
 *
 * @param dim         the dimension column; its type is passed to the dictionary generator
 * @param inputValues the column values; duplicates are dropped via a hash set and every
 *                    element is cast to {@code String}
 * @return the dictionary produced by {@code DictionaryGenerator.buildDictionary}
 * @throws IOException declared by the signature for the dictionary build call
 */
private Dictionary<String> buildDictionary(TblColRef dim, List<Object> inputValues) throws IOException {
    Stopwatch timer = Stopwatch.createUnstarted();
    timer.start();

    // Plain cast: the elements are expected to already be strings.
    final Function<Object, String> castToString = new Function<Object, String>() {
        @Nullable
        @Override
        public String apply(Object input) {
            return (String) input;
        }
    };
    final Collection<String> distinctValues = Collections2.transform(Sets.newHashSet(inputValues), castToString);
    final IterableDictionaryValueEnumerator valueEnumerator = new IterableDictionaryValueEnumerator(
            distinctValues);
    final Dictionary<String> built = DictionaryGenerator.buildDictionary(dim.getType(), valueEnumerator);

    timer.stop();
    if (logger.isDebugEnabled()) {
        String message = "BuildDictionary for column : " + dim.getName() + " took : "
                + timer.elapsed(MILLISECONDS) + " ms ";
        logger.debug(message);
    }
    return built;
}

Example 5

Source File: CubingUtils.java From kylin-on-parquet-v2 with Apache License 2.0

5 votes

/**
 * Builds one dictionary per dimension column from the given records.
 *
 * <p>The columns returned by {@code listDimensionColumnsExcludingDerived(true)} are numbered
 * from zero in list order; the cell at index {@code i} of a record is attributed to the
 * column with number {@code i}. Cells at indexes without a column are ignored.
 *
 * @param cubeInstance the cube whose descriptor supplies the dimension columns
 * @param recordList   the records to scan; each record is a list of cell values
 * @return a map from every column that received at least one value to its dictionary
 * @throws IOException declared by the signature for the dictionary build call
 */
public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    // Number the dimension columns in list order.
    final HashMap<Integer, TblColRef> columnByPosition = Maps.newHashMap();
    int position = 0;
    for (TblColRef col : cubeInstance.getDescriptor().listDimensionColumnsExcludingDerived(true)) {
        columnByPosition.put(position, col);
        position++;
    }

    // Collect the distinct cell values of each column.
    HashMultimap<TblColRef, String> distinctValues = HashMultimap.create();
    for (List<String> record : recordList) {
        for (int cellIdx = 0; cellIdx < record.size(); cellIdx++) {
            final String cellValue = record.get(cellIdx);
            if (!columnByPosition.containsKey(cellIdx)) {
                continue;
            }
            distinctValues.put(columnByPosition.get(cellIdx), cellValue);
        }
    }

    // Turn each column's value set into a dictionary.
    HashMap<TblColRef, Dictionary<String>> dictionaries = Maps.newHashMap();
    for (TblColRef col : distinctValues.keySet()) {
        Set<String> columnValues = distinctValues.get(col);
        IterableDictionaryValueEnumerator valueEnumerator = new IterableDictionaryValueEnumerator(columnValues);
        dictionaries.put(col, DictionaryGenerator.buildDictionary(col.getType(), valueEnumerator));
    }
    return dictionaries;
}

Example 6

Source File: CubingUtils.java From kylin with Apache License 2.0

5 votes

/**
 * Scans the records and builds a dictionary for each dimension column that has values.
 *
 * <p>Dimension columns come from {@code listDimensionColumnsExcludingDerived(true)} and are
 * indexed from zero in list order; a record's cell at index {@code i} belongs to the column
 * indexed {@code i}. Cells whose index has no column are skipped.
 *
 * @param cubeInstance the cube whose descriptor supplies the dimension columns
 * @param recordList   the records to scan; each record is a list of cell values
 * @return the dictionaries keyed by column, one per column that received a value
 * @throws IOException declared by the signature for the dictionary build call
 */
public static Map<TblColRef, Dictionary<String>> buildDictionary(final CubeInstance cubeInstance,
        Iterable<List<String>> recordList) throws IOException {
    final List<TblColRef> dictColumns = cubeInstance.getDescriptor().listDimensionColumnsExcludingDerived(true);

    // Index -> column lookup, indexes assigned in list order.
    final HashMap<Integer, TblColRef> indexToColumn = Maps.newHashMap();
    int nextIndex = 0;
    for (TblColRef dictColumn : dictColumns) {
        indexToColumn.put(nextIndex, dictColumn);
        nextIndex += 1;
    }

    // Distinct values seen for each column.
    HashMultimap<TblColRef, String> valuesByColumn = HashMultimap.create();
    for (List<String> record : recordList) {
        for (int idx = 0; idx < record.size(); idx++) {
            final String value = record.get(idx);
            final boolean hasColumn = indexToColumn.containsKey(idx);
            if (hasColumn) {
                valuesByColumn.put(indexToColumn.get(idx), value);
            }
        }
    }

    HashMap<TblColRef, Dictionary<String>> built = Maps.newHashMap();
    for (TblColRef dictColumn : valuesByColumn.keySet()) {
        Set<String> distinct = valuesByColumn.get(dictColumn);
        Dictionary<String> columnDict = DictionaryGenerator.buildDictionary(dictColumn.getType(),
                new IterableDictionaryValueEnumerator(distinct));
        built.put(dictColumn, columnDict);
    }
    return built;
}

Example 7

Source File: MergeDictReducer.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * Merges every serialized dictionary received for one column and writes the result, wrapped
 * in a fully serialized {@code DictionaryInfo}, to the reducer output under the same key.
 *
 * <p>If the column is not present in {@code colNeedDictMap} a warning is logged and nothing
 * is written. A single incoming dictionary is passed through unchanged; several are merged
 * through {@code DictionaryGenerator.buildDictionary}.
 *
 * @param key     the column identifier; its string form is the lookup key in {@code colNeedDictMap}
 * @param values  the serialized dictionaries for that column
 * @param context the reducer context the merged dictionary info is written to
 * @throws IllegalArgumentException if no dictionary was received for the column, or the
 *                                  merge produced {@code null}
 * @throws IOException              declared by the signature for (de)serialization and output
 * @throws InterruptedException     declared by the signature for {@code context.write}
 */
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        // Text.getBytes() exposes the backing buffer, which is only valid up to getLength()
        // and may carry stale trailing bytes; copyBytes() returns exactly the valid bytes in
        // a fresh array.
        ByteArray byteArray = new ByteArray(value.copyBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }

    Dictionary<String> mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        // Nothing to merge: reuse the only dictionary as-is.
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    // Serialize the dictionary info into an in-memory buffer and emit it as the output value.
    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}

Example 8

Source File: MergeDictReducer.java From kylin with Apache License 2.0

4 votes

/**
 * Merges every serialized dictionary received for one column and writes the result, wrapped
 * in a fully serialized {@code DictionaryInfo}, to the reducer output under the same key.
 *
 * <p>If the column is not present in {@code colNeedDictMap} a warning is logged and nothing
 * is written. A single incoming dictionary is passed through unchanged; several are merged
 * through {@code DictionaryGenerator.buildDictionary}.
 *
 * @param key     the column identifier; its string form is the lookup key in {@code colNeedDictMap}
 * @param values  the serialized dictionaries for that column
 * @param context the reducer context the merged dictionary info is written to
 * @throws IllegalArgumentException if no dictionary was received for the column, or the
 *                                  merge produced {@code null}
 * @throws IOException              declared by the signature for (de)serialization and output
 * @throws InterruptedException     declared by the signature for {@code context.write}
 */
@Override
protected void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    String col = key.toString();
    logger.info("merge dictionary for column:{}", col);
    TblColRef tblColRef = colNeedDictMap.get(col);

    if (tblColRef == null) {
        logger.warn("column:{} not found in the columns need dictionary map: {}", col, colNeedDictMap.keySet());
        return;
    }

    DataType dataType = tblColRef.getType();
    List<Dictionary<String>> dicts = Lists.newLinkedList();
    for (Text value : values) {
        // Text.getBytes() exposes the backing buffer, which is only valid up to getLength()
        // and may carry stale trailing bytes; copyBytes() returns exactly the valid bytes in
        // a fresh array.
        ByteArray byteArray = new ByteArray(value.copyBytes());
        Dictionary<String> dict = (Dictionary<String>) DictionarySerializer.deserialize(byteArray);
        dicts.add(dict);
    }

    Dictionary<String> mergedDict;
    if (dicts.size() > 1) {
        MultipleDictionaryValueEnumerator multipleDictionaryValueEnumerator = new MultipleDictionaryValueEnumerator(
                dataType, dicts);
        mergedDict = DictionaryGenerator.buildDictionary(dataType, multipleDictionaryValueEnumerator);
    } else if (dicts.size() == 1) {
        // Nothing to merge: reuse the only dictionary as-is.
        mergedDict = dicts.get(0);
    } else {
        throw new IllegalArgumentException("Dictionary missing for column " + col);
    }
    if (mergedDict == null) {
        throw new IllegalArgumentException("Merge dictionaries error for column " + col);
    }

    TableDesc tableDesc = tblColRef.getColumnDesc().getTable();
    IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
    signature.setLastModifiedTime(System.currentTimeMillis());
    signature.setPath(tableDesc.getResourcePath());

    //TODO: Table signature size?
    //        signature.setSize(mergedDict.getSize());

    DictionaryInfo dictionaryInfo = new DictionaryInfo(tblColRef.getTable(), tblColRef.getName(), tblColRef
            .getColumnDesc().getZeroBasedIndex(), tblColRef.getDatatype(), signature);
    dictionaryInfo.setDictionaryObject(mergedDict);
    dictionaryInfo.setDictionaryClass(mergedDict.getClass().getName());
    dictionaryInfo.setCardinality(mergedDict.getSize());

    // Serialize the dictionary info into an in-memory buffer and emit it as the output value.
    ByteArrayOutputStream fulBuf = new ByteArrayOutputStream();
    DataOutputStream fulDout = new DataOutputStream(fulBuf);
    DictionaryInfoSerializer.FULL_SERIALIZER.serialize(dictionaryInfo, fulDout);

    Text outValue = new Text(fulBuf.toByteArray());
    context.write(key, outValue);
    logger.debug("output dict info of column {} to path: {}", col,
            context.getConfiguration().get(FileOutputFormat.OUTDIR));
}