org.apache.kylin.dict.DictionaryInfo Java Examples

The following examples show how to use org.apache.kylin.dict.DictionaryInfo. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SegmentAppendTrieDictBuilder.java    From kylin-on-parquet-v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up an append-trie dictionary builder that writes into a fresh, per-call directory.
 *
 * @param dictInfo supplies the source table/column names and the dictionary resource directory
 * @param baseId   value stored in this builder's {@code baseId} field
 * @param hdfsDir  root directory to build under; when {@code null}, the HDFS working directory
 *                 from the environment's KylinConfig is used instead
 * @throws IOException propagated from the calls made here
 */
@Override
public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException {
    sourceColumn = dictInfo.getSourceTable() + "." + dictInfo.getSourceColumn();

    KylinConfig config = KylinConfig.getInstanceFromEnv();
    int maxEntriesPerSlice = config.getAppendDictEntrySize();

    // A null hdfsDir means the build runs in the Kylin job server: fall back to its working dir.
    String rootDir = (hdfsDir != null) ? hdfsDir : KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory();

    // The UUID keeps concurrent segment builds in separate directories;
    // the timestamp makes stale segment dictionaries easy to find and delete.
    String segmentDictRoot = rootDir + "resources/SegmentDict" + dictInfo.getResourceDir() + "/";
    String uniqueDirName = RandomUtil.randomUUID().toString() + "_" + System.currentTimeMillis();
    String baseDir = segmentDictRoot + uniqueDirName + "/";

    this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, false);
    this.baseId = baseId;
}
 
Example #2
Source File: IIManager.java    From Kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the dictionary of a column for the given II segment.
 *
 * @return the dictionary object, or {@code null} when the segment records no dictionary
 *         resource path for the column
 * @throws IllegalStateException when a path is recorded but no dictionary info is found for it,
 *         or when the lookup fails with an IOException
 */
public Dictionary<?> getDictionary(IISegment iiSeg, TblColRef col) {
    DictionaryInfo dictInfo;
    try {
        DictionaryManager manager = getDictionaryManager();
        String path = iiSeg.getDictResPath(col);
        if (path == null) {
            return null;
        }

        dictInfo = manager.getDictionaryInfo(path);
        if (dictInfo == null) {
            throw new IllegalStateException(
                    "No dictionary found by " + path + ", invalid II state; II segment" + iiSeg + ", col " + col);
        }
    } catch (IOException ioe) {
        throw new IllegalStateException("Failed to get dictionary for II segment" + iiSeg + ", col" + col, ioe);
    }

    return dictInfo.getDictionaryObject();
}
 
Example #3
Source File: DictionaryManagerTest.java    From Kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the dictionary of the same column twice and checks that both builds resolve to the
 * same DictionaryInfo instance and the same dictionary object, then reads every value back.
 */
@Test
@Ignore("hive not ready")
public void basic() throws Exception {
    CubeDesc cubeDesc = CubeDescManager.getInstance(getTestConfig()).getCubeDesc("test_kylin_cube_without_slr_desc");
    TblColRef col = cubeDesc.findColumnRef("DEFAULT.TEST_CATEGORY_GROUPINGS", "META_CATEG_NAME");

    DictionaryInfo info1 = dictMgr.buildDictionary(cubeDesc.getModel(), cubeDesc.getRowkey().getDictionary(col), col, null);
    System.out.println(JsonUtil.writeValueAsIndentString(info1));

    DictionaryInfo info2 = dictMgr.buildDictionary(cubeDesc.getModel(), cubeDesc.getRowkey().getDictionary(col), col, null);
    System.out.println(JsonUtil.writeValueAsIndentString(info2));

    // The UUID is a value (presumably a String — confirm): compare it with equals().
    // Reference equality (==) would only pass when both infos share the very same object.
    assertTrue(info1.getUuid().equals(info2.getUuid()));

    // Identity is intended here: the manager is expected to hand back the same instance.
    assertTrue(info1 == dictMgr.getDictionaryInfo(info1.getResourcePath()));
    assertTrue(info2 == dictMgr.getDictionaryInfo(info2.getResourcePath()));

    assertTrue(info1.getDictionaryObject() == info2.getDictionaryObject());

    touchDictValues(info1);
}
 
Example #4
Source File: CubeManager.java    From Kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the dictionary of a column for the given cube segment.
 *
 * @return the dictionary object, or {@code null} when the segment records no dictionary
 *         resource path for the column
 * @throws IllegalStateException when a path is recorded but no dictionary info is found for it,
 *         or when the lookup fails with an IOException
 */
public Dictionary<?> getDictionary(CubeSegment cubeSeg, TblColRef col) {
    DictionaryInfo dictInfo;
    try {
        DictionaryManager manager = getDictionaryManager();
        String path = cubeSeg.getDictResPath(col);
        if (path == null) {
            return null;
        }

        dictInfo = manager.getDictionaryInfo(path);
        if (dictInfo == null) {
            throw new IllegalStateException(
                    "No dictionary found by " + path + ", invalid cube state; cube segment" + cubeSeg + ", col " + col);
        }
    } catch (IOException ioe) {
        throw new IllegalStateException("Failed to get dictionary for cube segment" + cubeSeg + ", col" + col, ioe);
    }

    return dictInfo.getDictionaryObject();
}
 
Example #5
Source File: DumpDictionaryCLI.java    From Kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively prints every {@code .dict} file found at or below {@code f} to standard output:
 * the file path, its last-modified date, the dictionary info as indented JSON, and the
 * dictionary content itself. Files with any other extension are ignored.
 *
 * @param f a dictionary file or a directory to walk
 * @throws IOException if a dictionary file cannot be read or deserialized
 */
public static void dump(File f) throws IOException {
    if (f.isDirectory()) {
        // listFiles() returns null when the directory cannot be listed (I/O error, permissions).
        File[] children = f.listFiles();
        if (children == null) {
            return;
        }
        for (File c : children) {
            dump(c);
        }
        return;
    }

    if (f.getName().endsWith(".dict")) {
        DictionaryInfoSerializer ser = new DictionaryInfoSerializer();
        // try-with-resources closes the file handle even when deserialization or printing fails.
        try (DataInputStream in = new DataInputStream(new FileInputStream(f))) {
            DictionaryInfo dictInfo = ser.deserialize(in);

            System.out.println("============================================================================");
            System.out.println("File: " + f.getAbsolutePath());
            System.out.println(new Date(dictInfo.getLastModified()));
            System.out.println(JsonUtil.writeValueAsIndentString(dictInfo));
            dictInfo.getDictionaryObject().dump(System.out);
            System.out.println();
        }
    }
}
 
Example #6
Source File: MergeCuboidMapperTest.java    From Kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a two-value dictionary with a fake table signature, saves it through the
 * dictionary manager, dumps it to standard output, and returns the info it was built from.
 */
private DictionaryInfo makeSharedDict() throws IOException {
    TableSignature fakeSignature = new TableSignature();
    fakeSignature.setPath("fake_common_dict");
    fakeSignature.setLastModifiedTime(System.currentTimeMillis());
    fakeSignature.setSize(100);

    DictionaryInfo sharedInfo = new DictionaryInfo("", "", 0, "string", fakeSignature, "");

    // Two three-byte values: {101,101,101} and {102,102,102}.
    List<byte[]> rawValues = new ArrayList<byte[]>();
    for (byte b = 101; b <= 102; b++) {
        rawValues.add(new byte[] { b, b, b });
    }

    Dictionary<?> sharedDict = DictionaryGenerator.buildDictionaryFromValueList(sharedInfo, rawValues);
    dictionaryManager.trySaveNewDict(sharedDict, sharedInfo);
    ((TrieDictionary) sharedDict).dump(System.out);

    return sharedInfo;
}
 
Example #7
Source File: CubeManager.java    From kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Records the dictionary's resource path and rowkey stats on a writable copy of the segment
 * and submits the copy as a cube update. Does nothing when {@code dictInfo} is {@code null}.
 */
private void saveDictionaryInfo(CubeSegment cubeSeg, TblColRef col, DictionaryInfo dictInfo)
        throws IOException {
    if (dictInfo != null) {
        // Mutate a fresh copy rather than the cached cube/segment objects.
        CubeInstance writableCube = cubeSeg.getCubeInstance().latestCopyForWrite();
        CubeSegment writableSeg = writableCube.getSegmentById(cubeSeg.getUuid());

        Dictionary<?> dictionary = dictInfo.getDictionaryObject();
        writableSeg.putDictResPath(col, dictInfo.getResourcePath());
        writableSeg.getRowkeyStats()
                .add(new Object[] { col.getIdentity(), dictionary.getSize(), dictionary.getSizeOfId() });

        CubeUpdate cubeUpdate = new CubeUpdate(writableCube);
        cubeUpdate.setToUpdateSegs(writableSeg);
        updateCube(cubeUpdate);
    }
}
 
Example #8
Source File: DumpDictionaryCLI.java    From kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively prints every {@code .dict} file found at or below {@code f} to standard output:
 * the file path, its last-modified date, the dictionary info as indented JSON, and the
 * dictionary content itself. Files with any other extension are ignored.
 *
 * @param f a dictionary file or a directory to walk
 * @throws IOException if a dictionary file cannot be read or deserialized
 */
public static void dump(File f) throws IOException {
    if (f.isDirectory()) {
        // listFiles() returns null when the directory cannot be listed.
        File[] files = f.listFiles();
        if (files == null) {
            return;
        }
        for (File c : files) {
            dump(c);
        }
        return;
    }

    if (f.getName().endsWith(".dict")) {
        DictionaryInfoSerializer ser = new DictionaryInfoSerializer();
        // try-with-resources closes the file handle even when deserialization or printing fails.
        try (DataInputStream in = new DataInputStream(new FileInputStream(f))) {
            DictionaryInfo dictInfo = ser.deserialize(in);

            System.out.println("============================================================================");
            System.out.println("File: " + f.getAbsolutePath());
            System.out.println(new Date(dictInfo.getLastModified()));
            System.out.println(JsonUtil.writeValueAsIndentString(dictInfo));
            dictInfo.getDictionaryObject().dump(System.out);
            System.out.println();
        }
    }
}
 
Example #9
Source File: CubingUtils.java    From kylin with Apache License 2.0 6 votes vote down vote up
@SuppressWarnings("unchecked")
public static Map<TblColRef, Dictionary<String>> writeDictionary(CubeSegment cubeSegment,
        Map<TblColRef, Dictionary<String>> dictionaryMap, long startOffset, long endOffset) {
    Map<TblColRef, Dictionary<String>> realDictMap = Maps.newHashMap();

    for (Map.Entry<TblColRef, Dictionary<String>> entry : dictionaryMap.entrySet()) {
        final TblColRef tblColRef = entry.getKey();
        final Dictionary<String> dictionary = entry.getValue();
        IReadableTable.TableSignature signature = new IReadableTable.TableSignature();
        signature.setLastModifiedTime(System.currentTimeMillis());
        signature.setPath(String.format(Locale.ROOT, "streaming_%s_%s", startOffset, endOffset));
        signature.setSize(endOffset - startOffset);
        DictionaryInfo dictInfo = new DictionaryInfo(tblColRef.getColumnDesc(), tblColRef.getDatatype(), signature);
        logger.info("writing dictionary for TblColRef:" + tblColRef.toString());
        DictionaryManager dictionaryManager = DictionaryManager.getInstance(cubeSegment.getCubeDesc().getConfig());
        try {
            DictionaryInfo realDict = dictionaryManager.trySaveNewDict(dictionary, dictInfo);
            cubeSegment.putDictResPath(tblColRef, realDict.getResourcePath());
            realDictMap.put(tblColRef, (Dictionary<String>) realDict.getDictionaryObject());
        } catch (IOException e) {
            throw new RuntimeException("error save dictionary for column:" + tblColRef, e);
        }
    }

    return realDictMap;
}
 
Example #10
Source File: MergeDictionaryStep.java    From kylin with Apache License 2.0 6 votes vote down vote up
/**
 * For the new segment, we need to create new dimension dictionaries by merging underlying
 * dictionaries. (https://issues.apache.org/jira/browse/KYLIN-2457, https://issues.apache.org/jira/browse/KYLIN-2800)
 *
 * For every column that needs a dictionary, the distinct DictionaryInfo objects of the
 * merging segments are collected and handed to {@code mergeDictionaries}.
 *
 * @param conf            config used to obtain the DictionaryManager
 * @param cube            cube whose descriptor lists the columns needing a dictionary
 * @param newSeg          segment that receives the merged dictionaries
 * @param mergingSegments segments whose dictionaries are merged
 * @throws IOException if a dictionary cannot be loaded or merged
 */
private void makeDictForNewSegment(KylinConfig conf, CubeInstance cube, CubeSegment newSeg, List<CubeSegment> mergingSegments) throws IOException {
    DictionaryManager dictMgr = DictionaryManager.getInstance(conf);
    CubeDesc cubeDesc = cube.getDescriptor();

    for (TblColRef col : cubeDesc.getAllColumnsNeedDictionaryBuilt()) {
        logger.info("Merging fact table dictionary on : " + col);
        List<DictionaryInfo> dictInfos = new ArrayList<DictionaryInfo>();
        for (CubeSegment segment : mergingSegments) {
            logger.info("Including fact table dictionary of segment : " + segment);
            String dictResPath = segment.getDictResPath(col);
            if (dictResPath == null) {
                continue;
            }

            DictionaryInfo dictInfo = dictMgr.getDictionaryInfo(dictResPath);
            if (dictInfo == null) {
                // Only a missing DictionaryInfo is a load failure.
                logger.warn("Failed to load DictionaryInfo from " + dictResPath);
            } else if (!dictInfos.contains(dictInfo)) {
                // A dictionary shared by several segments is collected once, without a warning.
                dictInfos.add(dictInfo);
            }
        }
        mergeDictionaries(dictMgr, newSeg, dictInfos, col);
    }
}
 
Example #11
Source File: SegmentAppendTrieDictBuilder.java    From kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Sets up an append-trie dictionary builder that writes into a fresh, per-call directory.
 *
 * @param dictInfo supplies the source table/column names and the dictionary resource directory
 * @param baseId   value stored in this builder's {@code baseId} field
 * @param hdfsDir  root directory to build under; when {@code null}, the HDFS working directory
 *                 from the environment's KylinConfig is used instead
 * @throws IOException propagated from the calls made here
 */
@Override
public void init(DictionaryInfo dictInfo, int baseId, String hdfsDir) throws IOException {
    sourceColumn = dictInfo.getSourceTable() + "." + dictInfo.getSourceColumn();

    KylinConfig config = KylinConfig.getInstanceFromEnv();
    int maxEntriesPerSlice = config.getAppendDictEntrySize();

    // A null hdfsDir means the build runs in the Kylin job server: fall back to its working dir.
    String rootDir = (hdfsDir != null) ? hdfsDir : KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory();

    // The UUID keeps concurrent segment builds in separate directories;
    // the timestamp makes stale segment dictionaries easy to find and delete.
    String segmentDictRoot = rootDir + "resources/SegmentDict" + dictInfo.getResourceDir() + "/";
    String uniqueDirName = RandomUtil.randomUUID().toString() + "_" + System.currentTimeMillis();
    String baseDir = segmentDictRoot + uniqueDirName + "/";

    this.builder = new AppendTrieDictionaryBuilder(baseDir, maxEntriesPerSlice, false);
    this.baseId = baseId;
}
 
Example #12
Source File: CubeMigrationCrossClusterCLI.java    From kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Copies one dictionary from the source cluster to the destination cluster unless the
 * destination already has an item at {@code dictPath}. When saving reports a non-null path,
 * every segment entry of the cube that points at {@code dictPath} (compared ignoring case)
 * is redirected to that reported path.
 */
private void copyDictionary(CubeInstance cube, String dictPath) throws IOException {
    if (dstCluster.exists(dictPath)) {
        logger.info("Item {} has already existed in destination cluster", dictPath);
        return;
    }

    DictionaryInfo sourceInfo = srcCluster.getDictionaryInfo(dictPath);
    String reusedPath = dstCluster.saveDictionary(sourceInfo);
    if (reusedPath == null) {
        return;
    }

    for (CubeSegment seg : cube.getSegments()) {
        for (Map.Entry<String, String> dictEntry : seg.getDictionaries().entrySet()) {
            boolean pointsAtCopiedDict = dictEntry.getValue().equalsIgnoreCase(dictPath);
            if (pointsAtCopiedDict) {
                dictEntry.setValue(reusedPath);
            }
        }
    }
    logger.info("Item {} is dup, instead {} is reused", dictPath, reusedPath);
}
 
Example #13
Source File: DstClusterUtil.java    From kylin with Apache License 2.0 6 votes vote down vote up
/**
 * Searches the resources listed under the dictionary's resource directory for one whose
 * dictionary object equals the given one.
 *
 * @return the path of the first equal existing dictionary, or {@code null} when the listing
 *         is {@code null} or nothing matches
 */
private String checkDupDict(DictionaryInfo dictInfo) throws IOException {
    NavigableSet<String> candidates = resourceStore.listResources(dictInfo.getResourceDir());
    if (candidates == null) {
        return null;
    }

    logger.info("{} existing dictionaries of the same column", candidates.size());
    if (candidates.size() > 100) {
        logger.warn("Too many dictionaries under {}, dict count: {}", dictInfo.getResourceDir(), candidates.size());
    }

    for (String candidatePath : candidates) {
        DictionaryInfo candidate = getDictionaryInfo(candidatePath);
        if (candidate == null) {
            continue;
        }
        if (dictInfo.getDictionaryObject().equals(candidate.getDictionaryObject())) {
            return candidatePath;
        }
    }

    return null;
}
 
Example #14
Source File: CubeManager.java    From kylin-on-parquet-v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Records the dictionary's resource path and rowkey stats on a writable copy of the segment
 * and submits the copy as a cube update. Does nothing when {@code dictInfo} is {@code null}.
 */
private void saveDictionaryInfo(CubeSegment cubeSeg, TblColRef col, DictionaryInfo dictInfo)
        throws IOException {
    if (dictInfo != null) {
        // Mutate a fresh copy rather than the cached cube/segment objects.
        CubeInstance writableCube = cubeSeg.getCubeInstance().latestCopyForWrite();
        CubeSegment writableSeg = writableCube.getSegmentById(cubeSeg.getUuid());

        Dictionary<?> dictionary = dictInfo.getDictionaryObject();
        writableSeg.putDictResPath(col, dictInfo.getResourcePath());
        writableSeg.getRowkeyStats()
                .add(new Object[] { col.getIdentity(), dictionary.getSize(), dictionary.getSizeOfId() });

        CubeUpdate cubeUpdate = new CubeUpdate(writableCube);
        cubeUpdate.setToUpdateSegs(writableSeg);
        updateCube(cubeUpdate);
    }
}
 
Example #15
Source File: MergeDictionaryStep.java    From kylin-on-parquet-v2 with Apache License 2.0 6 votes vote down vote up
/**
 * For the new segment, we need to create new dimension dictionaries by merging underlying
 * dictionaries. (https://issues.apache.org/jira/browse/KYLIN-2457, https://issues.apache.org/jira/browse/KYLIN-2800)
 *
 * For every column that needs a dictionary, the distinct DictionaryInfo objects of the
 * merging segments are collected and handed to {@code mergeDictionaries}.
 *
 * @param conf            config used to obtain the DictionaryManager
 * @param cube            cube whose descriptor lists the columns needing a dictionary
 * @param newSeg          segment that receives the merged dictionaries
 * @param mergingSegments segments whose dictionaries are merged
 * @throws IOException if a dictionary cannot be loaded or merged
 */
private void makeDictForNewSegment(KylinConfig conf, CubeInstance cube, CubeSegment newSeg, List<CubeSegment> mergingSegments) throws IOException {
    DictionaryManager dictMgr = DictionaryManager.getInstance(conf);
    CubeDesc cubeDesc = cube.getDescriptor();

    for (TblColRef col : cubeDesc.getAllColumnsNeedDictionaryBuilt()) {
        logger.info("Merging fact table dictionary on : " + col);
        List<DictionaryInfo> dictInfos = new ArrayList<DictionaryInfo>();
        for (CubeSegment segment : mergingSegments) {
            logger.info("Including fact table dictionary of segment : " + segment);
            String dictResPath = segment.getDictResPath(col);
            if (dictResPath == null) {
                continue;
            }

            DictionaryInfo dictInfo = dictMgr.getDictionaryInfo(dictResPath);
            if (dictInfo == null) {
                // Only a missing DictionaryInfo is a load failure.
                logger.warn("Failed to load DictionaryInfo from " + dictResPath);
            } else if (!dictInfos.contains(dictInfo)) {
                // A dictionary shared by several segments is collected once, without a warning.
                dictInfos.add(dictInfo);
            }
        }
        mergeDictionaries(dictMgr, newSeg, dictInfos, col);
    }
}
 
Example #16
Source File: CubingUtils.java    From kylin-on-parquet-v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Saves each given dictionary through the DictionaryManager, records the saved resource path
 * on the cube segment, and returns the dictionaries that the manager actually kept.
 *
 * @param cubeSegment   segment that receives the dictionary resource paths
 * @param dictionaryMap dictionaries to save, keyed by column
 * @param startOffset   start offset; used in the signature path and size
 * @param endOffset     end offset; used in the signature path and size
 * @return map from column to the dictionary object of the saved DictionaryInfo
 * @throws RuntimeException wrapping the IOException when saving a dictionary fails
 */
@SuppressWarnings("unchecked")
public static Map<TblColRef, Dictionary<String>> writeDictionary(CubeSegment cubeSegment,
        Map<TblColRef, Dictionary<String>> dictionaryMap, long startOffset, long endOffset) {
    Map<TblColRef, Dictionary<String>> savedDicts = Maps.newHashMap();

    for (Map.Entry<TblColRef, Dictionary<String>> dictEntry : dictionaryMap.entrySet()) {
        final TblColRef column = dictEntry.getKey();
        final Dictionary<String> builtDict = dictEntry.getValue();

        // Synthetic signature describing the streaming offset range.
        IReadableTable.TableSignature rangeSignature = new IReadableTable.TableSignature();
        rangeSignature.setLastModifiedTime(System.currentTimeMillis());
        rangeSignature.setPath(String.format(Locale.ROOT, "streaming_%s_%s", startOffset, endOffset));
        rangeSignature.setSize(endOffset - startOffset);

        DictionaryInfo pendingInfo = new DictionaryInfo(column.getColumnDesc(), column.getDatatype(),
                rangeSignature);
        logger.info("writing dictionary for TblColRef:" + column.toString());
        DictionaryManager manager = DictionaryManager.getInstance(cubeSegment.getCubeDesc().getConfig());

        try {
            DictionaryInfo savedInfo = manager.trySaveNewDict(builtDict, pendingInfo);
            cubeSegment.putDictResPath(column, savedInfo.getResourcePath());
            savedDicts.put(column, (Dictionary<String>) savedInfo.getDictionaryObject());
        } catch (IOException ioe) {
            throw new RuntimeException("error save dictionary for column:" + column, ioe);
        }
    }

    return savedDicts;
}
 
Example #17
Source File: DumpDictionaryCLI.java    From kylin-on-parquet-v2 with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively prints every {@code .dict} file found at or below {@code f} to standard output:
 * the file path, its last-modified date, the dictionary info as indented JSON, and the
 * dictionary content itself. Files with any other extension are ignored.
 *
 * @param f a dictionary file or a directory to walk
 * @throws IOException if a dictionary file cannot be read or deserialized
 */
public static void dump(File f) throws IOException {
    if (f.isDirectory()) {
        // listFiles() returns null when the directory cannot be listed.
        File[] files = f.listFiles();
        if (files == null) {
            return;
        }
        for (File c : files) {
            dump(c);
        }
        return;
    }

    if (f.getName().endsWith(".dict")) {
        DictionaryInfoSerializer ser = new DictionaryInfoSerializer();
        // try-with-resources closes the file handle even when deserialization or printing fails.
        try (DataInputStream in = new DataInputStream(new FileInputStream(f))) {
            DictionaryInfo dictInfo = ser.deserialize(in);

            System.out.println("============================================================================");
            System.out.println("File: " + f.getAbsolutePath());
            System.out.println(new Date(dictInfo.getLastModified()));
            System.out.println(JsonUtil.writeValueAsIndentString(dictInfo));
            dictInfo.getDictionaryObject().dump(System.out);
            System.out.println();
        }
    }
}
 
Example #18
Source File: CubeManager.java    From kylin-on-parquet-v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the dictionary of a column for the given cube segment. When the cube descriptor
 * lists any domain dictionaries, the globally reused dictionary path is tried first; the
 * segment's own dictionary path is the fallback.
 *
 * @return the dictionary, or {@code null} when neither lookup yields a resource path
 * @throws IllegalStateException when a path is found but no dictionary info exists for it,
 *         or when the lookup fails with an IOException
 */
@SuppressWarnings("unchecked")
public Dictionary<String> getDictionary(CubeSegment cubeSeg, TblColRef col) {
    DictionaryInfo dictInfo;
    try {
        DictionaryManager manager = getDictionaryManager();

        // Global domain dictionary: a reusable path, consulted only if any are declared.
        String path = null;
        List<CubeDescTiretreeGlobalDomainDictUtil.GlobalDict> domainDicts = cubeSeg.getCubeDesc().listDomainDict();
        if (!domainDicts.isEmpty()) {
            path = CubeDescTiretreeGlobalDomainDictUtil.globalReuseDictPath(cubeSeg.getConfig(), col, cubeSeg.getCubeDesc());
        }

        // Fall back to the path recorded on the segment itself.
        if (path == null) {
            path = cubeSeg.getDictResPath(col);
        }
        if (path == null) {
            return null;
        }

        dictInfo = manager.getDictionaryInfo(path);
        if (dictInfo == null) {
            throw new IllegalStateException("No dictionary found by " + path
                    + ", invalid cube state; cube segment" + cubeSeg + ", col " + col);
        }
    } catch (IOException ioe) {
        throw new IllegalStateException("Failed to get dictionary for cube segment" + cubeSeg + ", col" + col,
                ioe);
    }
    return (Dictionary<String>) dictInfo.getDictionaryObject();
}
 
Example #19
Source File: IIManager.java    From Kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a dictionary for every non-metrics column of the II descriptor, records each
 * dictionary's resource path on the segment, and finally saves the owning II instance.
 */
public void buildInvertedIndexDictionary(IISegment iiSeg, String factColumnsPath) throws IOException {
    logger.info("Start building ii dictionary");
    DictionaryManager manager = getDictionaryManager();
    IIDesc desc = iiSeg.getIIInstance().getDescriptor();

    for (TblColRef column : desc.listAllColumns()) {
        logger.info("Dealing with column {}", column);
        // Metrics columns get no dictionary.
        if (!desc.isMetricsCol(column)) {
            DictionaryInfo built = manager.buildDictionary(desc.getModel(), "true", column, factColumnsPath);
            iiSeg.putDictResPath(column, built.getResourcePath());
        }
    }

    saveResource(iiSeg.getIIInstance());
}
 
Example #20
Source File: DictionaryManagerTest.java    From Kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the value of every id in [0, cardinality) from the dictionary and asserts that the
 * number of distinct values equals the cardinality.
 */
@SuppressWarnings("unchecked")
private void touchDictValues(DictionaryInfo info1) {
    Dictionary<String> dictionary = (Dictionary<String>) info1.getDictionaryObject();

    HashSet<String> distinctValues = new HashSet<String>();
    int cardinality = info1.getCardinality();
    for (int id = 0; id < cardinality; id++) {
        String value = dictionary.getValueFromId(id);
        distinctValues.add(value);
    }

    assertEquals(info1.getCardinality(), distinctValues.size());
}
 
Example #21
Source File: CubeManager.java    From Kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the dictionary of a rowkey column, records its resource path on the segment, and
 * saves the owning cube instance.
 *
 * @return the built dictionary info, or {@code null} when the rowkey does not use a
 *         dictionary for the column
 */
public DictionaryInfo buildDictionary(CubeSegment cubeSeg, TblColRef col, String factColumnsPath) throws IOException {
    CubeDesc desc = cubeSeg.getCubeDesc();
    if (desc.getRowkey().isUseDictionary(col)) {
        DictionaryManager manager = getDictionaryManager();
        DictionaryInfo built = manager.buildDictionary(desc.getModel(), desc.getRowkey().getDictionary(col), col,
                factColumnsPath);
        cubeSeg.putDictResPath(col, built.getResourcePath());

        saveResource(cubeSeg.getCubeInstance());

        return built;
    }
    return null;
}
 
Example #22
Source File: MergeDictionaryStep.java    From kylin-on-parquet-v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Merges the given dictionaries and, when the merge yields a result, records its resource
 * path on the segment for the column.
 *
 * @return the merged dictionary info, or {@code null} when the manager returns none
 */
private DictionaryInfo mergeDictionaries(DictionaryManager dictMgr, CubeSegment cubeSeg, List<DictionaryInfo> dicts, TblColRef col) throws IOException {
    DictionaryInfo merged = dictMgr.mergeDictionary(dicts);
    if (merged == null) {
        return null;
    }

    cubeSeg.putDictResPath(col, merged.getResourcePath());
    return merged;
}
 
Example #23
Source File: UHCDictionaryReducer.java    From kylin-on-parquet-v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the reducer: binds the configuration, creates the multiple-outputs writer, picks
 * the UHC column whose index equals this task's id, and initialises a dictionary builder
 * for it (a generator-chosen builder for shard-by columns, otherwise the builder class
 * configured in the cube descriptor).
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    // The task id doubles as the index of the UHC column handled here.
    int columnIndex = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(columnIndex);
    logger.info("column name: " + col.getIdentity());

    boolean isShardByColumn = cube.getDescriptor().getShardByColumns().contains(col);
    if (!isShardByColumn) {
        // Global dictionary column: use the builder class configured for it.
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    } else {
        // Shard-by column: builder chosen from the column type.
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    }
}
 
Example #24
Source File: SparkBuildDictionary.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Saves or builds the dictionary of one column and reports where it was stored.
 *
 * @return pair of the column identity and a triple (dictionary resource path, dictionary
 *         size, size of id)
 */
@Override
public Tuple2<String, Tuple3<String, Integer, Integer>> call(TblColRef tblColRef) throws Exception {
    // One-time initialisation: the flag is re-checked while holding the class lock.
    if (!initialized) {
        synchronized (SparkBuildDictionary.class) {
            if (!initialized) {
                init();
            }
        }
    }

    logger.info("Building dictionary for column {}", tblColRef);
    IReadableTable inpTable = getDistinctValuesFor(tblColRef);
    Dictionary<String> columnDict;
    DictionaryInfo savedInfo;
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        columnDict = getDictionary(tblColRef);

        if (columnDict == null) {
            logger.info("Dict for '{}' not pre-built, build it from {}", tblColRef.getName(), inpTable);
            String builderClass = cubeSegment.getCubeDesc().getDictionaryBuilderClass(tblColRef);
            savedInfo = dictManager.buildDictionary(tblColRef, inpTable, builderClass);
            columnDict = savedInfo.getDictionaryObject();
        } else {
            logger.info("Dict for '{}' has already been built, save it", tblColRef.getName());
            savedInfo = dictManager.saveDictionary(tblColRef, inpTable, columnDict);
        }
    }

    Tuple3<String, Integer, Integer> dictStats = new Tuple3<>(savedInfo.getResourcePath(), columnDict.getSize(),
            columnDict.getSizeOfId());
    return new Tuple2<>(tblColRef.getIdentity(), dictStats);
}
 
Example #25
Source File: SparkUHCDictionary.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the dictionary of one UHC column from its values and serialises it.
 *
 * @param columnValues pair of (index into {@code uhcColumns}, values of that column)
 * @return pair of the output-dict constant and a triple (NullWritable, serialised dictionary
 *         bytes prefixed with the dictionary class name, dictionary file name)
 */
@Override
public Tuple2<String, Tuple3<Writable, Writable, String>> call(Tuple2<Integer, List<String>> columnValues) throws Exception {
    // One-time initialisation: the flag is re-checked while holding the lock.
    // NOTE(review): the lock object is SparkFactDistinct.class — confirm this is intended
    // and not a leftover from copied code.
    if (!initialized) {
        synchronized (SparkFactDistinct.class) {
            if (!initialized) {
                init();
            }
        }
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig.setAndUnsetThreadLocalConfig(config);
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutputStream outputStream = new DataOutputStream(baos)) {
        TblColRef col = uhcColumns.get(columnValues._1);
        logger.info("Processing column " + col.getName());

        boolean isShardByColumn = cube.getDescriptor().getShardByColumns().contains(col);
        if (!isShardByColumn) {
            // Global dictionary column: use the builder class configured for it.
            DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
            String builderClass = cubeDesc.getDictionaryBuilderClass(col);
            builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
            builder.init(dictionaryInfo, 0, hdfsDir);
        } else {
            // Shard-by column: builder chosen from the column type.
            builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
            builder.init(null, 0, null);
        }

        for (String value : columnValues._2) {
            builder.addValue(value);
        }
        Dictionary<String> dict = builder.build();
        String dictFileName = col.getIdentity() + "/" + col.getName() + DICT_FILE_POSTFIX;
        logger.info("Dictionary file name is " + dictFileName);

        // Payload layout: class name (UTF) followed by the dictionary's own serialisation.
        outputStream.writeUTF(dict.getClass().getName());
        dict.write(outputStream);
        Tuple3 payload = new Tuple3(NullWritable.get(), new ArrayPrimitiveWritable(baos.toByteArray()), dictFileName);
        return new Tuple2<>(BatchConstants.CFG_OUTPUT_DICT, payload);
    }
}
 
Example #26
Source File: CubeManager.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the dictionary of a column for the given cube segment. When the cube descriptor
 * lists any domain dictionaries, the globally reused dictionary path is tried first; the
 * segment's own dictionary path is the fallback.
 *
 * @return the dictionary, or {@code null} when neither lookup yields a resource path
 * @throws IllegalStateException when a path is found but no dictionary info exists for it,
 *         or when the lookup fails with an IOException
 */
@SuppressWarnings("unchecked")
public Dictionary<String> getDictionary(CubeSegment cubeSeg, TblColRef col) {
    DictionaryInfo dictInfo;
    try {
        DictionaryManager manager = getDictionaryManager();

        // Global domain dictionary: a reusable path, consulted only if any are declared.
        String path = null;
        List<CubeDescTiretreeGlobalDomainDictUtil.GlobalDict> domainDicts = cubeSeg.getCubeDesc().listDomainDict();
        if (!domainDicts.isEmpty()) {
            path = CubeDescTiretreeGlobalDomainDictUtil.globalReuseDictPath(cubeSeg.getConfig(), col, cubeSeg.getCubeDesc());
        }

        // Fall back to the path recorded on the segment itself.
        if (path == null) {
            path = cubeSeg.getDictResPath(col);
        }
        if (path == null) {
            return null;
        }

        dictInfo = manager.getDictionaryInfo(path);
        if (dictInfo == null) {
            throw new IllegalStateException("No dictionary found by " + path
                    + ", invalid cube state; cube segment" + cubeSeg + ", col " + col);
        }
    } catch (IOException ioe) {
        throw new IllegalStateException("Failed to get dictionary for cube segment" + cubeSeg + ", col" + col,
                ioe);
    }
    return dictInfo.getDictionaryObject();
}
 
Example #27
Source File: CubeManager.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Saves an already built dictionary for a column and records it on the cube segment.
 *
 * @return the saved dictionary info, or {@code null} when the column is not among the
 *         columns that need a dictionary built
 */
public DictionaryInfo saveDictionary(CubeSegment cubeSeg, TblColRef col, IReadableTable inpTable,
        Dictionary<String> dict) throws IOException {
    CubeDesc desc = cubeSeg.getCubeDesc();
    boolean needsDictionary = desc.getAllColumnsNeedDictionaryBuilt().contains(col);
    if (needsDictionary) {
        DictionaryInfo saved = getDictionaryManager().saveDictionary(col, inpTable, dict);
        saveDictionaryInfo(cubeSeg, col, saved);
        return saved;
    }
    return null;
}
 
Example #28
Source File: CubeManager.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the dictionary of a column from the given table, using the builder class configured
 * in the cube descriptor, and records it on the cube segment.
 *
 * @return the built dictionary info, or {@code null} when the column is not among the
 *         columns that need a dictionary built
 */
public DictionaryInfo buildDictionary(CubeSegment cubeSeg, TblColRef col, IReadableTable inpTable)
        throws IOException {
    CubeDesc desc = cubeSeg.getCubeDesc();
    boolean needsDictionary = desc.getAllColumnsNeedDictionaryBuilt().contains(col);
    if (needsDictionary) {
        String builderClass = desc.getDictionaryBuilderClass(col);
        DictionaryInfo built = getDictionaryManager().buildDictionary(col, inpTable, builderClass);
        saveDictionaryInfo(cubeSeg, col, built);
        return built;
    }
    return null;
}
 
Example #29
Source File: UHCDictionaryReducer.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the reducer: binds the configuration, creates the multiple-outputs writer, picks
 * the UHC column whose index equals this task's id, and initialises a dictionary builder
 * for it (a generator-chosen builder for shard-by columns, otherwise the builder class
 * configured in the cube descriptor).
 */
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    Configuration conf = context.getConfiguration();
    mos = new MultipleOutputs(context);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<TblColRef> uhcColumns = cubeDesc.getAllUHCColumns();

    // The task id doubles as the index of the UHC column handled here.
    int columnIndex = context.getTaskAttemptID().getTaskID().getId();
    col = uhcColumns.get(columnIndex);
    logger.info("column name: " + col.getIdentity());

    boolean isShardByColumn = cube.getDescriptor().getShardByColumns().contains(col);
    if (!isShardByColumn) {
        // Global dictionary column: use the builder class configured for it.
        String hdfsDir = conf.get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);
        DictionaryInfo dictionaryInfo = new DictionaryInfo(col.getColumnDesc(), col.getDatatype());
        String builderClass = cubeDesc.getDictionaryBuilderClass(col);
        builder = (IDictionaryBuilder) ClassUtil.newInstance(builderClass);
        builder.init(dictionaryInfo, 0, hdfsDir);
    } else {
        // Shard-by column: builder chosen from the column type.
        builder = DictionaryGenerator.newDictionaryBuilder(col.getType());
        builder.init(null, 0, null);
    }
}
 
Example #30
Source File: MergeDictionaryStep.java    From kylin with Apache License 2.0 5 votes vote down vote up
/**
 * Merges the given dictionaries and, when the merge yields a result, records its resource
 * path on the segment for the column.
 *
 * @return the merged dictionary info, or {@code null} when the manager returns none
 */
private DictionaryInfo mergeDictionaries(DictionaryManager dictMgr, CubeSegment cubeSeg, List<DictionaryInfo> dicts, TblColRef col) throws IOException {
    DictionaryInfo merged = dictMgr.mergeDictionary(dicts);
    if (merged == null) {
        return null;
    }

    cubeSeg.putDictResPath(col, merged.getResourcePath());
    return merged;
}