org.apache.kylin.dict.TrieDictionary Java Examples

The following examples show how to use org.apache.kylin.dict.TrieDictionary. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: MergeCuboidMapperTest.java From Kylin with Apache License 2.0

6 votes

/**
 * Builds a two-value dictionary described by a fake table signature (path
 * {@code fake_common_dict}), hands it to the dictionary manager for saving, and dumps it to
 * standard output.
 *
 * @return the {@code DictionaryInfo} that was passed to {@code trySaveNewDict}
 * @throws IOException if one of the called dictionary operations raises it
 */
private DictionaryInfo makeSharedDict() throws IOException {
    TableSignature fakeSignature = new TableSignature();
    fakeSignature.setSize(100);
    fakeSignature.setLastModifiedTime(System.currentTimeMillis());
    fakeSignature.setPath("fake_common_dict");
    DictionaryInfo sharedInfo = new DictionaryInfo("", "", 0, "string", fakeSignature, "");

    // Two three-byte entries: ASCII "eee" and "fff".
    List<byte[]> dictValues = new ArrayList<byte[]>();
    dictValues.add(new byte[] { 101, 101, 101 });
    dictValues.add(new byte[] { 102, 102, 102 });

    Dictionary<?> sharedDict = DictionaryGenerator.buildDictionaryFromValueList(sharedInfo, dictValues);
    dictionaryManager.trySaveNewDict(sharedDict, sharedInfo);

    // The generated dictionary is expected to be a TrieDictionary; the cast fails otherwise.
    TrieDictionary trie = (TrieDictionary) sharedDict;
    trie.dump(System.out);

    return sharedInfo;
}

Example #2

Source File: AppendDictNode.java From kylin-on-parquet-v2 with Apache License 2.0

5 votes

/**
 * Serializes one trie node into {@code trieBytes} beginning at {@code offset}.
 * <p>
 * Bytes are laid out in this order: a child-offset field of {@code sizeChildOffset} bytes (only the
 * flag bits of its first byte are touched here), one byte holding the length of {@code n.part},
 * the bytes of {@code n.part}, and, only when the node ends a value, the node id in
 * {@code sizeId} bytes.
 *
 * @param n               node to serialize
 * @param offset          index in {@code trieBytes} where the node starts
 * @param isLastChild     whether {@code TrieDictionary.BIT_IS_LAST_CHILD} is set on the first byte
 * @param sizeChildOffset width in bytes of the child-offset field
 * @param sizeId          width in bytes of the id field
 * @param trieBytes       destination buffer
 * @return the index just past the last byte belonging to this node
 * @throws RuntimeException if {@code n.part} is longer than 255 bytes
 */
private int build_writeNode(AppendDictNode n, int offset, boolean isLastChild, int sizeChildOffset, int sizeId,
        byte[] trieBytes) {
    // Flags are OR-ed into the first byte of the child-offset field; the offset value is not written here.
    if (isLastChild) {
        trieBytes[offset] |= TrieDictionary.BIT_IS_LAST_CHILD;
    }
    if (n.isEndOfValue) {
        trieBytes[offset] |= TrieDictionary.BIT_IS_END_OF_VALUE;
    }
    int cursor = offset + sizeChildOffset;

    // The length is stored in a single byte, hence the 255 limit.
    final byte[] part = n.part;
    final int partLength = part.length;
    if (partLength > 255) {
        throw new RuntimeException(
                "Value length is " + partLength + " and larger than 255: " + Bytes.toStringBinary(part));
    }
    BytesUtil.writeUnsigned(partLength, trieBytes, cursor, 1);
    cursor += 1;

    System.arraycopy(part, 0, trieBytes, cursor, partLength);
    cursor += partLength;

    if (!n.isEndOfValue) {
        return cursor;
    }

    // Only value-terminating nodes carry an id.
    checkValidId(n.id);
    BytesUtil.writeUnsigned(n.id, trieBytes, cursor, sizeId);
    return cursor + sizeId;
}

Example #3

Source File: AppendDictNode.java From kylin with Apache License 2.0

5 votes

/**
 * Writes a single node of the trie into {@code trieBytes} and reports where the next node may start.
 * <p>
 * The node occupies, in sequence: {@code sizeChildOffset} bytes reserved for the child offset
 * (this method only sets flag bits in the first of them), a one-byte length, the content of
 * {@code n.part}, and the id in {@code sizeId} bytes if {@code n.isEndOfValue} is true.
 *
 * @param n               the node being written
 * @param offset          start position of the node inside {@code trieBytes}
 * @param isLastChild     true to mark the node with {@code TrieDictionary.BIT_IS_LAST_CHILD}
 * @param sizeChildOffset number of bytes reserved for the child offset
 * @param sizeId          number of bytes used for the id
 * @param trieBytes       buffer receiving the node
 * @return the position following the bytes written for this node
 * @throws RuntimeException when {@code n.part} has more than 255 bytes
 */
private int build_writeNode(AppendDictNode n, int offset, boolean isLastChild, int sizeChildOffset, int sizeId,
        byte[] trieBytes) {
    int pos = offset;

    // child-offset field: set the flag bits on its first byte, then skip over the whole field
    if (isLastChild) {
        trieBytes[pos] |= TrieDictionary.BIT_IS_LAST_CHILD;
    }
    if (n.isEndOfValue) {
        trieBytes[pos] |= TrieDictionary.BIT_IS_END_OF_VALUE;
    }
    pos += sizeChildOffset;

    // length byte: must fit in one unsigned byte
    int nValueBytes = n.part.length;
    if (nValueBytes > 255) {
        String message = "Value length is " + nValueBytes + " and larger than 255: "
                + Bytes.toStringBinary(n.part);
        throw new RuntimeException(message);
    }
    BytesUtil.writeUnsigned(nValueBytes, trieBytes, pos++, 1);

    // value bytes
    System.arraycopy(n.part, 0, trieBytes, pos, nValueBytes);
    pos += nValueBytes;

    // id, present only on nodes that terminate a value
    if (n.isEndOfValue) {
        checkValidId(n.id);
        BytesUtil.writeUnsigned(n.id, trieBytes, pos, sizeId);
        pos += sizeId;
    }

    return pos;
}

Example #4

Source File: AppendDictNode.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * Stores {@code childOffset} into the child-offset field of the node at {@code parentOffset}
 * while keeping the two flag bits already present in the field's first byte.
 *
 * @param parentOffset    index in {@code trieBytes} of the parent node's first byte
 * @param childOffset     offset value to store
 * @param sizeChildOffset width in bytes of the child-offset field
 * @param trieBytes       buffer holding the serialized trie
 */
private void build_overwriteChildOffset(int parentOffset, int childOffset, int sizeChildOffset, byte[] trieBytes) {
    final int flagMask = TrieDictionary.BIT_IS_LAST_CHILD | TrieDictionary.BIT_IS_END_OF_VALUE;

    // Capture the flags first: the write below starts at the same byte that carries them.
    final int preservedFlags = trieBytes[parentOffset] & flagMask;

    BytesUtil.writeUnsigned(childOffset, trieBytes, parentOffset, sizeChildOffset);

    trieBytes[parentOffset] |= preservedFlags;
}

Example #5

Source File: ColumnarMemoryStorePersister.java From kylin-on-parquet-v2 with Apache License 2.0

4 votes

/**
 * Encodes every value of one dimension and writes the result to {@code indexOut}: the encoded
 * column data first, followed by the index produced by the column descriptor's writer. A new
 * {@link DimensionMetaInfo} recording name, start offset, data length, index length, null
 * presence and compression is appended to {@code dimensionMetaList}.
 *
 * @param cuboidId          cuboid id; used here only in the debug log message
 * @param dimValueList      values of the dimension; non-null elements are cast to {@code String}
 * @param dimensionMetaList list that receives the meta info created for this dimension
 * @param indexOut          counting output stream that both data and index are written to
 * @param dimension         the dimension column being persisted
 * @param dictMaps          dictionaries by column, consulted when the dimension uses dictionary encoding
 * @throws IOException if writing to the output fails
 */
private void persistDimension(long cuboidId, List<Object> dimValueList, List<DimensionMetaInfo> dimensionMetaList,
        CountingOutputStream indexOut, TblColRef dimension, Map<TblColRef, Dictionary<String>> dictMaps)
        throws IOException {
    final Stopwatch timer = new Stopwatch();
    timer.start();

    final DimensionMetaInfo meta = new DimensionMetaInfo();
    dimensionMetaList.add(meta);

    final DimensionEncoding encoding;
    final IIColumnDescriptor columnDescriptor;
    if (!dimensionsUseDictEncoding.contains(dimension)) {
        // No dictionary: take the encoding from the rowkey column description.
        RowKeyColDesc rowKeyCol = cubeDesc.getRowkey().getColDesc(dimension);
        encoding = DimensionEncodingFactory.create(rowKeyCol.getEncodingName(), rowKeyCol.getEncodingArgs(),
                rowKeyCol.getEncodingVersion());
        columnDescriptor = new FixLenIIColumnDescriptor(dimension.getName(), encoding.getLengthOfEncoding());
    } else {
        Dictionary<String> dictionary = dictMaps.get(dimension);
        encoding = new DictionaryDimEnc(dictionary);
        // Trie dictionaries get a descriptor built from their id range; others use the fixed-length one.
        columnDescriptor = (dictionary instanceof TrieDictionary)
                ? new SeqIIColumnDescriptor(dimension.getName(), dictionary.getMinId(), dictionary.getMaxId())
                : new FixLenIIColumnDescriptor(dimension.getName(), encoding.getLengthOfEncoding());
    }

    meta.setName(dimension.getName());
    meta.setStartOffset((int) indexOut.getCount());
    final int encodedLength = encoding.getLengthOfEncoding();

    final DataOutputStream dataOut = new DataOutputStream(indexOut);
    final ColumnarStoreDimDesc storeDesc = getColumnarStoreDimDesc(dimension, encoding);
    final ColumnDataWriter dataWriter = storeDesc.getDimWriter(dataOut, dimValueList.size());

    // Each value is encoded into a fresh fixed-length buffer, which is handed to both the
    // index writer and the column data writer.
    for (Object cell : dimValueList) {
        final byte[] encoded = new byte[encodedLength];
        if (cell == null) {
            encoding.encode(null, encoded, 0);
            meta.setHasNull(true);
        } else {
            encoding.encode((String) cell, encoded, 0);
        }
        columnDescriptor.getWriter().addValue(encoded);
        dataWriter.write(encoded);
    }
    dataWriter.flush();
    meta.setDataLength(dataOut.size());

    // The index follows the data; its length is whatever was written after start offset + data.
    columnDescriptor.getWriter().write(indexOut);
    final int bytesSoFar = (int) indexOut.getCount();
    meta.setIndexLength(bytesSoFar - meta.getStartOffset() - meta.getDataLength());
    meta.setCompression(storeDesc.getCompression().name());

    timer.stop();
    if (logger.isDebugEnabled()) {
        logger.debug("cuboid-{} saved dimension:{}, took: {}ms", cuboidId, dimension.getName(),
                timer.elapsedMillis());
    }
}

Example #6

Source File: AppendDictNode.java From kylin with Apache License 2.0

4 votes

/**
 * Rewrites the child-offset field of the node located at {@code parentOffset} with
 * {@code childOffset}, restoring afterwards the last-child and end-of-value bits that the
 * field's first byte held before the write.
 *
 * @param parentOffset    position of the parent node's first byte in {@code trieBytes}
 * @param childOffset     new offset value
 * @param sizeChildOffset number of bytes the child-offset field occupies
 * @param trieBytes       serialized trie buffer, modified in place
 */
private void build_overwriteChildOffset(int parentOffset, int childOffset, int sizeChildOffset, byte[] trieBytes) {
    // Remember the two flag bits before the field is overwritten.
    byte headByte = trieBytes[parentOffset];
    int keptBits = headByte & (TrieDictionary.BIT_IS_LAST_CHILD | TrieDictionary.BIT_IS_END_OF_VALUE);

    BytesUtil.writeUnsigned(childOffset, trieBytes, parentOffset, sizeChildOffset);

    // Merge the remembered bits back into the freshly written first byte.
    trieBytes[parentOffset] = (byte) (trieBytes[parentOffset] | keptBits);
}

Example #7

Source File: ColumnarMemoryStorePersister.java From kylin with Apache License 2.0

4 votes

/**
 * Persists a single dimension to {@code indexOut}. The encoded values are written first through a
 * column data writer, then the index held by the column descriptor's writer is written to the same
 * stream. Offsets and lengths of both parts, together with the dimension name, null presence and
 * compression name, are recorded in a new {@link DimensionMetaInfo} added to
 * {@code dimensionMetaList}.
 *
 * @param cuboidId          id of the cuboid; only referenced in the debug log line
 * @param dimValueList      dimension values; every non-null entry is cast to {@code String}
 * @param dimensionMetaList collects the meta info object created by this call
 * @param indexOut          byte-counting stream receiving data and index
 * @param dimension         column whose values are persisted
 * @param dictMaps          per-column dictionaries, read when the dimension is dictionary encoded
 * @throws IOException if writing to the output fails
 */
private void persistDimension(long cuboidId, List<Object> dimValueList, List<DimensionMetaInfo> dimensionMetaList,
        CountingOutputStream indexOut, TblColRef dimension, Map<TblColRef, Dictionary<String>> dictMaps)
        throws IOException {
    Stopwatch watch = Stopwatch.createUnstarted();
    watch.start();

    DimensionMetaInfo metaInfo = new DimensionMetaInfo();
    dimensionMetaList.add(metaInfo);

    // Choose the encoding and the matching index descriptor.
    DimensionEncoding dimEncoding;
    IIColumnDescriptor indexDescriptor;
    boolean dictEncoded = dimensionsUseDictEncoding.contains(dimension);
    if (dictEncoded) {
        Dictionary<String> dimDict = dictMaps.get(dimension);
        dimEncoding = new DictionaryDimEnc(dimDict);
        boolean isTrie = dimDict instanceof TrieDictionary;
        if (!isTrie) {
            indexDescriptor = new FixLenIIColumnDescriptor(dimension.getName(), dimEncoding.getLengthOfEncoding());
        } else {
            indexDescriptor = new SeqIIColumnDescriptor(dimension.getName(), dimDict.getMinId(),
                    dimDict.getMaxId());
        }
    } else {
        RowKeyColDesc rowKeyColDesc = cubeDesc.getRowkey().getColDesc(dimension);
        dimEncoding = DimensionEncodingFactory.create(rowKeyColDesc.getEncodingName(),
                rowKeyColDesc.getEncodingArgs(), rowKeyColDesc.getEncodingVersion());
        indexDescriptor = new FixLenIIColumnDescriptor(dimension.getName(), dimEncoding.getLengthOfEncoding());
    }

    metaInfo.setName(dimension.getName());
    metaInfo.setStartOffset((int) indexOut.getCount());
    int bufferSize = dimEncoding.getLengthOfEncoding();

    DataOutputStream dataStream = new DataOutputStream(indexOut);
    ColumnarStoreDimDesc dimStoreDesc = getColumnarStoreDimDesc(dimension, dimEncoding);
    ColumnDataWriter valueWriter = dimStoreDesc.getDimWriter(dataStream, dimValueList.size());

    // Every value, null included, becomes one fixed-length byte array that goes to both the
    // index writer and the data writer.
    for (Object value : dimValueList) {
        byte[] buffer = new byte[bufferSize];
        boolean isNull = (value == null);
        if (isNull) {
            dimEncoding.encode(null, buffer, 0);
            metaInfo.setHasNull(true);
        } else {
            dimEncoding.encode((String) value, buffer, 0);
        }
        indexDescriptor.getWriter().addValue(buffer);
        valueWriter.write(buffer);
    }
    valueWriter.flush();
    metaInfo.setDataLength(dataStream.size());

    // Index bytes come after the data; derive their length from the stream's running count.
    indexDescriptor.getWriter().write(indexOut);
    int countAfterIndex = (int) indexOut.getCount();
    int indexLength = countAfterIndex - metaInfo.getStartOffset() - metaInfo.getDataLength();
    metaInfo.setIndexLength(indexLength);
    metaInfo.setCompression(dimStoreDesc.getCompression().name());

    watch.stop();
    if (logger.isDebugEnabled()) {
        logger.debug("cuboid-{} saved dimension:{}, took: {}ms", cuboidId, dimension.getName(),
                watch.elapsed(MILLISECONDS));
    }
}

Example #8

Source File: MergeCuboidMapperTest.java From Kylin with Apache License 2.0

4 votes

/**
 * Test fixture: creates test metadata, clears the manager caches, removes {@code ../job/meta},
 * creates the map driver, looks up the cube and three of its column references, and gives each
 * cube segment a freshly built dictionary for {@code lfn} plus the shared dictionary for
 * {@code lsi} and {@code ssc}.
 *
 * @throws Exception if any of the setup steps fails
 */
@Before
public void setUp() throws Exception {
    createTestMetadata();

    logger.info("The metadataUrl is : " + getTestConfig());

    MetadataManager.clearCache();
    CubeManager.clearCache();
    ProjectManager.clearCache();
    DictionaryManager.clearCache();

    // Hack for the distributed cache: wipe the job metadata directory so the mapper under test
    // picks up the latest CubeManager state.
    FileUtils.deleteDirectory(new File("../job/meta"));

    mapDriver = MapDriver.newMapDriver(new MergeCuboidMapper());

    cubeManager = CubeManager.getInstance(getTestConfig());
    cube = cubeManager.getCube("test_kylin_cube_without_slr_left_join_ready_2_segments");
    dictionaryManager = DictionaryManager.getInstance(getTestConfig());
    lfn = cube.getDescriptor().findColumnRef("DEFAULT.TEST_KYLIN_FACT", "LSTG_FORMAT_NAME");
    lsi = cube.getDescriptor().findColumnRef("DEFAULT.TEST_KYLIN_FACT", "CAL_DT");
    ssc = cube.getDescriptor().findColumnRef("DEFAULT.TEST_CATEGORY_GROUPINGS", "META_CATEG_NAME");

    DictionaryInfo sharedDictInfo = makeSharedDict();

    boolean firstSegment = true;
    for (CubeSegment seg : cube.getSegments()) {
        TableSignature fakeSignature = new TableSignature();
        fakeSignature.setSize(100);
        fakeSignature.setLastModifiedTime(System.currentTimeMillis());
        fakeSignature.setPath("fake_dict_for" + lfn.getName() + seg.getName());

        DictionaryInfo segDictInfo = new DictionaryInfo(lfn.getTable(), lfn.getColumn().getName(),
                lfn.getColumn().getZeroBasedIndex(), "string", fakeSignature, "");

        // Every segment holds "aaa"; the first one adds "ccc", all later ones add "bbb".
        List<byte[]> dictValues = new ArrayList<byte[]>();
        dictValues.add(new byte[] { 97, 97, 97 });
        dictValues.add(firstSegment ? new byte[] { 99, 99, 99 } : new byte[] { 98, 98, 98 });

        Dictionary<?> segDict = DictionaryGenerator.buildDictionaryFromValueList(segDictInfo, dictValues);
        dictionaryManager.trySaveNewDict(segDict, segDictInfo);
        TrieDictionary segTrie = (TrieDictionary) segDict;
        segTrie.dump(System.out);

        seg.putDictResPath(lfn, segDictInfo.getResourcePath());
        seg.putDictResPath(lsi, sharedDictInfo.getResourcePath());
        seg.putDictResPath(ssc, sharedDictInfo.getResourcePath());

        cubeManager.updateCube(cube);

        firstSegment = false;
    }
}