Java Code Examples for org.apache.kylin.common.util.HadoopUtil#getWorkingFileSystem()

The following examples show how to use org.apache.kylin.common.util.HadoopUtil#getWorkingFileSystem(). Each example is taken from an open-source project; the source file and project noted above each snippet identify where it comes from, so you can read it in its original context.
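Before the project examples, here is a minimal sketch of the typical pattern: resolve the file system that backs Kylin's working directory (kylin.env.hdfs-working-dir) and perform I/O under it. This is an illustrative, self-contained snippet, not code from any of the projects below; the class name and the marker path are made up for the example, and it assumes a reachable working directory is configured.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.common.KylinConfig;
import org.apache.kylin.common.util.HadoopUtil;

public class WorkingFileSystemSketch {

    public static void main(String[] args) throws IOException {
        // Resolve the file system that backs kylin.env.hdfs-working-dir
        // (usually HDFS, but it may be any Hadoop-compatible file system).
        FileSystem fs = HadoopUtil.getWorkingFileSystem();

        // Job artifacts are expected to live under the configured working directory.
        String workingDir = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory();
        Path marker = new Path(workingDir, "sketch/_marker"); // illustrative path

        if (!fs.exists(marker.getParent())) {
            fs.mkdirs(marker.getParent());
        }
        try (FSDataOutputStream out = fs.create(marker, true)) {
            out.writeUTF("hello");
        }
    }
}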
Example 1
Source File: CreateHTableJob.java    From kylin with Apache License 2.0
private void exportHBaseConfiguration(String hbaseTableName) throws IOException {

    Configuration hbaseConf = HBaseConnection.getCurrentHBaseConfiguration();
    HadoopUtil.healSickConfig(hbaseConf);
    Job job = Job.getInstance(hbaseConf, hbaseTableName);
    HTable table = new HTable(hbaseConf, hbaseTableName);
    HFileOutputFormat3.configureIncrementalLoadMap(job, table);

    logger.info("Saving HBase configuration to {}", hbaseConfPath);
    FileSystem fs = HadoopUtil.getWorkingFileSystem();
    FSDataOutputStream out = null;
    try {
        out = fs.create(new Path(hbaseConfPath));
        job.getConfiguration().writeXml(out);
    } finally {
        IOUtils.closeQuietly(out);
    }
}
 
Example 2
Source File: KylinHealthCheckJob.java    From kylin with Apache License 2.0
private void checkSegmentHDFSPath(List<CubeInstance> cubes) throws IOException {
    reporter.log("## Fix missing HDFS path of segments");
    FileSystem defaultFs = HadoopUtil.getWorkingFileSystem();
    for (CubeInstance cube : cubes) {
        for (CubeSegment segment : cube.getSegments()) {
            String jobUuid = segment.getLastBuildJobID();
            if (jobUuid != null && jobUuid.equals("") == false) {
                String path = JobBuilderSupport.getJobWorkingDir(config.getHdfsWorkingDirectory(), jobUuid);
                if (!defaultFs.exists(new Path(path))) {
                    reporter.log(
                            "Project: {} cube: {} segment: {} cube id data: {} don't exist and need to rebuild it",
                            cube.getProject(), cube.getName(), segment, path);
                    reporter.log(
                            "The rebuild url: -d '{\"startTime\":{}, \"endTime\":{}, \"buildType\":\"REFRESH\"}' /kylin/api/cubes/{}/build",
                            segment.getTSRange().start, segment.getTSRange().end, cube.getName());
                }
            }
        }
    }
}
 
Example 3
Source File: CreateHTableJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private void exportHBaseConfiguration(String hbaseTableName) throws IOException {

    Configuration hbaseConf = HBaseConnection.getCurrentHBaseConfiguration();
    HadoopUtil.healSickConfig(hbaseConf);
    Job job = Job.getInstance(hbaseConf, hbaseTableName);
    HTable table = new HTable(hbaseConf, hbaseTableName);
    HFileOutputFormat3.configureIncrementalLoadMap(job, table);

    logger.info("Saving HBase configuration to {}", hbaseConfPath);
    FileSystem fs = HadoopUtil.getWorkingFileSystem();
    FSDataOutputStream out = null;
    try {
        out = fs.create(new Path(hbaseConfPath));
        job.getConfiguration().writeXml(out);
    } finally {
        IOUtils.closeQuietly(out);
    }
}
 
Example 4
Source File: HiveColumnCardinalityUpdateJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : StringUtil.split(raw, "\n")) {
            results.add(str);
        }
    }
    return results;
}
 
Example 5
Source File: GarbageCollectionStep.java    From kylin-on-parquet-v2 with Apache License 2.0
private void rmdirOnHDFS(List<String> paths) throws IOException {
    for (String path : paths) {
        Path externalDataPath = new Path(path);
        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        if (fs.exists(externalDataPath)) {
            fs.delete(externalDataPath, true);
        }
    }
}
 
Example 6
Source File: BuildCubeWithStreamV2.java    From kylin-on-parquet-v2 with Apache License 2.0
public static void beforeClass(String confDir) throws Exception {
    logger.info("Adding to classpath: " + new File(confDir).getAbsolutePath());
    ClassUtil.addClasspath(new File(confDir).getAbsolutePath());

    System.setProperty(KylinConfig.KYLIN_CONF, confDir);
    System.setProperty("kylin.hadoop.conf.dir", confDir);
    if (StringUtils.isEmpty(System.getProperty("hdp.version"))) {
        throw new RuntimeException(
                "No hdp.version set; Please set hdp.version in your jvm option, for example: -Dhdp.version=2.4.0.0-169");
    }

    // use mocked stream data search client
    System.setProperty("kylin.stream.stand-alone.mode", "true");

    // DeployUtil.deployMetadata();

    // setup cube conn and h2 conn
    setupAll();

    try {
        //check hdfs permission
        FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
        String hdfsWorkingDirectory = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory();
        Path coprocessorDir = new Path(hdfsWorkingDirectory);
        boolean success = fileSystem.mkdirs(coprocessorDir);
        if (!success) {
            throw new IOException("mkdir fails, please check hdfs permission");
        }
    } catch (IOException e) {
        throw new RuntimeException(
                "failed to create kylin.env.hdfs-working-dir, Please make sure the user has right to access "
                        + KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(),
                e);
    }

    cleanStreamZkRoot();
}
 
Example 7
Source File: AbstractHadoopJob.java    From kylin with Apache License 2.0
public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0;//return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);

            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }

            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.trace("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
 
Example 8
Source File: AbstractHadoopJob.java    From kylin-on-parquet-v2 with Apache License 2.0
public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0;//return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);

            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }

            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.trace("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
 
Example 9
Source File: Repartitioner.java    From kylin-on-parquet-v2 with Apache License 2.0
public void doRepartition(NSparkCubingEngine.NSparkCubingStorage storage, String path, int repartitionNum, Column[] sortCols, SparkSession ss)
        throws IOException {
    String tempPath = path + tempDirSuffix;
    Path tempResourcePath = new Path(tempPath);

    FileSystem readFileSystem = HadoopUtil.getWorkingFileSystem();
    if (needRepartition()) {
        // repartition and write to target path
        logger.info("Start repartition and rewrite");
        long start = System.currentTimeMillis();
        Dataset<Row> data;

        if (needRepartitionForShardByColumns()) {
            ss.sessionState().conf().setLocalProperty("spark.sql.adaptive.enabled", "false");
            data = storage.getFrom(tempPath, ss).repartition(repartitionNum,
                    NSparkCubingUtil.getColumns(getShardByColumns()))
                    .sortWithinPartitions(sortCols);
        } else {
            // repartition for single file size is too small
            logger.info("repartition to {}", repartitionNum);
            data = storage.getFrom(tempPath, ss).repartition(repartitionNum)
                    .sortWithinPartitions(sortCols);
        }

        storage.saveTo(path, data, ss);
        if (needRepartitionForShardByColumns()) {
            ss.sessionState().conf().setLocalProperty("spark.sql.adaptive.enabled", null);
        }
        if (readFileSystem.delete(tempResourcePath, true)) {
            logger.info("Delete temp cuboid path successful. Temp path: {}.", tempPath);
        } else {
            logger.error("Delete temp cuboid path wrong, leave garbage. Temp path: {}.", tempPath);
        }
        long end = System.currentTimeMillis();
        logger.info("Repartition and rewrite ends. Cost: {} ms.", end - start);
    } else {
        Path goalPath = new Path(path);
        if (readFileSystem.exists(goalPath)) {
            logger.info("Path {} is exists, delete it.", goalPath);
            readFileSystem.delete(goalPath, true);
        }
        if (readFileSystem.rename(new Path(tempPath), goalPath)) {
            logger.info("Rename temp path to target path successfully. Temp path: {}, target path: {}.", tempPath,
                    path);
        } else {
            throw new RuntimeException(String.format(Locale.ROOT,
                    "Rename temp path to target path wrong. Temp path: %s, target path: %s.", tempPath, path));
        }
    }
}
 
Example 10
Source File: CubeBuildJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private String maxLeafTasksNums(Path shareDir) throws IOException {
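    // scan the resource-detect output files under the shared dir and return the largest value they record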
    FileSystem fs = HadoopUtil.getWorkingFileSystem();
    FileStatus[] fileStatuses = fs.listStatus(shareDir,
            path -> path.toString().endsWith(ResourceDetectUtils.cubingDetectItemFileSuffix()));
    return ResourceDetectUtils.selectMaxValueInFiles(fileStatuses);
}
 
Example 11
Source File: CubeMigrationCLI.java    From kylin with Apache License 2.0
public void moveCube(KylinConfig srcCfg, KylinConfig dstCfg, String cubeName, String projectName, boolean copyAcl,
        boolean purgeAndDisable, boolean overwriteIfExists, boolean realExecute, boolean migrateSegment)
        throws IOException, InterruptedException {
    doAclCopy = copyAcl;
    doOverwrite = overwriteIfExists;
    doMigrateSegment = migrateSegment;
    srcConfig = srcCfg;
    srcStore = ResourceStore.getStore(srcConfig);
    dstConfig = dstCfg;
    dstStore = ResourceStore.getStore(dstConfig);
    dstProject = projectName;

    CubeManager cubeManager = CubeManager.getInstance(srcConfig);
    CubeInstance cube = cubeManager.getCube(cubeName);
    logger.info("cube to be moved is : " + cubeName);

    if (migrateSegment) {
        checkCubeState(cube);
    }

    checkAndGetHbaseUrl();

    Configuration conf = HBaseConnection.getCurrentHBaseConfiguration();
    hbaseAdmin = new HBaseAdmin(conf);
    hdfsFS = HadoopUtil.getWorkingFileSystem();
    operations = new ArrayList<Opt>();
    copyFilesInMetaStore(cube);
    if (migrateSegment) {
        renameFoldersInHdfs(cube);
        changeHtableHost(cube);
    } else {
        clearSegments(cubeName); // this should be after copyFilesInMetaStore
    }
    addCubeAndModelIntoProject(cube, cubeName);

    if (migrateSegment && purgeAndDisable) {
        purgeAndDisable(cubeName); // this should be the last action
    }

    if (realExecute) {
        doOpts();
        if (migrateSegment) {
            checkMigrationSuccess(dstConfig, cubeName, true);
        }
        updateMeta(dstConfig, projectName, cubeName, cube.getModel());
    } else {
        showOpts();
    }
}
 
Example 12
Source File: StorageCleanupJob.java    From kylin-on-parquet-v2 with Apache License 2.0
public StorageCleanupJob() throws IOException {
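    // clean up both the working-dir file system and the file system of the HBase cluster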
    this(KylinConfig.getInstanceFromEnv(), HadoopUtil.getWorkingFileSystem(), HBaseConnection
            .getFileSystemInHBaseCluster(KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory()));
}
 
Example 13
Source File: CubeMigrationCLI.java    From kylin with Apache License 2.0
public static void moveCube(KylinConfig srcCfg, KylinConfig dstCfg, String cubeName, String projectName,
        String copyAcl, String purgeAndDisable, String overwriteIfExists, String realExecute)
        throws IOException, InterruptedException {

    srcConfig = srcCfg;
    srcStore = ResourceStore.getStore(srcConfig);
    dstConfig = dstCfg;
    dstStore = ResourceStore.getStore(dstConfig);

    CubeManager cubeManager = CubeManager.getInstance(srcConfig);
    CubeInstance cube = cubeManager.getCube(cubeName);
    logger.info("cube to be moved is : " + cubeName);

    if (cube.getStatus() != RealizationStatusEnum.READY)
        throw new IllegalStateException("Cannot migrate cube that is not in READY state.");

    for (CubeSegment segment : cube.getSegments()) {
        if (segment.getStatus() != SegmentStatusEnum.READY) {
            throw new IllegalStateException("At least one segment is not in READY state");
        }
    }

    checkAndGetHbaseUrl();

    Connection conn = HBaseConnection.get(srcConfig.getStorageUrl());
    hbaseAdmin = conn.getAdmin();

    hdfsFS = HadoopUtil.getWorkingFileSystem();

    operations = new ArrayList<Opt>();

    copyFilesInMetaStore(cube, overwriteIfExists);
    renameFoldersInHdfs(cube);
    changeHtableHost(cube);
    addCubeAndModelIntoProject(cube, cubeName, projectName);
    if (Boolean.parseBoolean(copyAcl) == true) {
        copyACL(cube, projectName);
    }

    if (Boolean.parseBoolean(purgeAndDisable) == true) {
        purgeAndDisable(cubeName); // this should be the last action
    }

    if (realExecute.equalsIgnoreCase("true")) {
        doOpts();
        checkMigrationSuccess(dstConfig, cubeName, true);
        updateMeta(dstConfig);
    } else {
        showOpts();
    }
}
 
Example 14
Source File: BuildCubeWithEngine.java    From kylin with Apache License 2.0
public static void beforeClass(String confDir) throws Exception {
    logger.info("Adding to classpath: " + new File(confDir).getAbsolutePath());
    ClassUtil.addClasspath(new File(confDir).getAbsolutePath());

    fastBuildMode = isFastBuildMode();
    simpleBuildMode = isSimpleBuildMode();
    if (fastBuildMode) {
        logger.info("Will use fast build mode");
    }
    if (simpleBuildMode) {
        logger.info("Will use simple build mode");
    }

    String specifiedEngineType = System.getProperty("engineType");
    if (StringUtils.isNotEmpty(specifiedEngineType)) {
        engineType = Integer.parseInt(specifiedEngineType);
    }

    System.setProperty(KylinConfig.KYLIN_CONF, confDir);
    System.setProperty("SPARK_HOME", "/usr/local/spark"); // need manually create and put spark to this folder on Jenkins
    System.setProperty("kylin.hadoop.conf.dir", confDir);
    if (StringUtils.isEmpty(System.getProperty("hdp.version"))) {
        throw new RuntimeException(
                "No hdp.version set; Please set hdp.version in your jvm option, for example: -Dhdp.version=2.4.0.0-169");
    }

    HBaseMetadataTestCase.staticCreateTestMetadata(confDir);

    try {
        //check hdfs permission
        FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
        String hdfsWorkingDirectory = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory();
        Path coprocessorDir = new Path(hdfsWorkingDirectory);
        boolean success = fileSystem.mkdirs(coprocessorDir);
        if (!success) {
            throw new IOException("mkdir fails");
        }
    } catch (IOException e) {
        throw new RuntimeException(
                "failed to create kylin.env.hdfs-working-dir, Please make sure the user has right to access "
                        + KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(),
                e);
    }
}
 
Example 15
Source File: AbstractHadoopJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private void setJobTmpJarsAndFiles(Job job, String kylinDependency) {
    if (StringUtils.isBlank(kylinDependency))
        return;

    logger.trace("setJobTmpJarsAndFiles: " + kylinDependency);

    try {
        Configuration jobConf = job.getConfiguration();
        FileSystem localfs = FileSystem.getLocal(jobConf);
        FileSystem hdfs = HadoopUtil.getWorkingFileSystem(jobConf);

        StringBuilder jarList = new StringBuilder();
        StringBuilder fileList = new StringBuilder();

        for (String fileName : StringUtil.splitAndTrim(kylinDependency, ",")) {
            Path p = new Path(fileName);
            if (p.isAbsolute() == false) {
                logger.warn("The directory of kylin dependency '" + fileName + "' is not absolute, skip");
                continue;
            }
            FileSystem fs;
            if (exists(hdfs, p)) {
                fs = hdfs;
            } else if (exists(localfs, p)) {
                fs = localfs;
            } else {
                logger.warn("The directory of kylin dependency '" + fileName + "' does not exist, skip");
                continue;
            }

            if (fs.getFileStatus(p).isDirectory()) {
                logger.trace("Expanding depedency directory: " + p);
                appendTmpDir(job, fs, p, jarList, fileList);
                continue;
            }

            StringBuilder list = (p.getName().endsWith(".jar")) ? jarList : fileList;
            if (list.length() > 0)
                list.append(",");
            list.append(fs.getFileStatus(p).getPath());
        }

        appendTmpFiles(fileList.toString(), jobConf);
        appendTmpJars(jarList.toString(), jobConf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example 16
Source File: SaveStatisticsStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    CubeSegment newSegment = CubingExecutableUtil.findSegment(context,
            CubingExecutableUtil.getCubeName(this.getParams()),
            CubingExecutableUtil.getSegmentId(this.getParams()));
    KylinConfig kylinConf = newSegment.getConfig();

    ResourceStore rs = ResourceStore.getStore(kylinConf);
    try {

        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
        Path statisticsDir = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        Path[] statisticsFiles = HadoopUtil.getFilteredPath(fs, statisticsDir,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDir);
        }

        Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
        long totalRowsBeforeMerge = 0;
        long grantTotal = 0;
        int samplingPercentage = -1;
        int mapperNumber = -1;
        for (Path item : statisticsFiles) {
            CubeStatsReader.CubeStatsResult cubeStatsResult = new CubeStatsReader.CubeStatsResult(item,
                    kylinConf.getCubeStatsHLLPrecision());
            cuboidHLLMap.putAll(cubeStatsResult.getCounterMap());
            long pGrantTotal = 0L;
            for (HLLCounter hll : cubeStatsResult.getCounterMap().values()) {
                pGrantTotal += hll.getCountEstimate();
            }
            totalRowsBeforeMerge += pGrantTotal * cubeStatsResult.getMapperOverlapRatio();
            grantTotal += pGrantTotal;
            int pMapperNumber = cubeStatsResult.getMapperNumber();
            if (pMapperNumber > 0) {
                if (mapperNumber < 0) {
                    mapperNumber = pMapperNumber;
                } else {
                    throw new RuntimeException(
                            "Base cuboid has been distributed to multiple reducers at step FactDistinctColumnsReducer!!!");
                }
            }
            int pSamplingPercentage = cubeStatsResult.getPercentage();
            if (samplingPercentage < 0) {
                samplingPercentage = pSamplingPercentage;
            } else if (samplingPercentage != pSamplingPercentage) {
                throw new RuntimeException(
                        "The sampling percentage should be same among all of the reducer of FactDistinctColumnsReducer!!!");
            }
        }
        if (samplingPercentage < 0) {
            logger.warn("The sampling percentage should be set!!!");
        }
        if (mapperNumber < 0) {
            logger.warn("The mapper number should be set!!!");
        }

        if (logger.isDebugEnabled()) {
            logMapperAndCuboidStatistics(cuboidHLLMap, samplingPercentage, mapperNumber, grantTotal,
                    totalRowsBeforeMerge);
        }
        double mapperOverlapRatio = grantTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grantTotal;
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        long sourceRecordCount = cubingJob.findSourceRecordCount();
        CubeStatsWriter.writeCuboidStatistics(hadoopConf, statisticsDir, cuboidHLLMap, samplingPercentage,
                mapperNumber, mapperOverlapRatio, sourceRecordCount);

        Path statisticsFile = new Path(statisticsDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);
        logger.info("{} stats saved to hdfs {}", newSegment, statisticsFile);

        FSDataInputStream is = fs.open(statisticsFile);
        try {
            // put the statistics to metadata store
            String resPath = newSegment.getStatisticsResourcePath();
            rs.putResource(resPath, is, System.currentTimeMillis());
            logger.info("{} stats saved to resource {}", newSegment, resPath);

            StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, newSegment);
            StatisticsDecisionUtil.optimizeCubingPlan(newSegment);
        } finally {
            IOUtils.closeStream(is);
        }

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to save cuboid statistics", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 17
Source File: UpdateDictionaryStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    final CubeManager cubeMgr = CubeManager.getInstance(context.getConfig());
    final DictionaryManager dictMgrHdfs;
    final DictionaryManager dictMgrHbase;
    final CubeInstance cube = cubeMgr.getCube(CubingExecutableUtil.getCubeName(this.getParams()));
    final CubeSegment newSegment = cube.getSegmentById(CubingExecutableUtil.getSegmentId(this.getParams()));
    final List<CubeSegment> mergingSegments = getMergingSegments(cube);
    final String dictInfoPath = this.getParams().get(BatchConstants.ARG_DICT_PATH);
    final String metadataUrl = this.getParams().get(BatchConstants.ARG_META_URL);

    final KylinConfig kylinConfHbase = cube.getConfig();
    final KylinConfig kylinConfHdfs = AbstractHadoopJob.loadKylinConfigFromHdfs(metadataUrl);

    Collections.sort(mergingSegments);

    try {
        Configuration conf = HadoopUtil.getCurrentConfiguration();
        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        ResourceStore hbaseRS = ResourceStore.getStore(kylinConfHbase);
        ResourceStore hdfsRS = ResourceStore.getStore(kylinConfHdfs);
        dictMgrHdfs = DictionaryManager.getInstance(kylinConfHdfs);
        dictMgrHbase = DictionaryManager.getInstance(kylinConfHbase);

        // work on copy instead of cached objects
        CubeInstance cubeCopy = cube.latestCopyForWrite();
        CubeSegment newSegCopy = cubeCopy.getSegmentById(newSegment.getUuid());

        // update cube segment dictionary

        FileStatus[] fileStatuss = fs.listStatus(new Path(dictInfoPath), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part") || path.getName().startsWith("tmp");
            }
        });

        for (FileStatus fileStatus : fileStatuss) {
            Path filePath = fileStatus.getPath();

            SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, conf);
            Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Text value = (Text) ReflectionUtils.newInstance(reader.getValueClass(), conf);

            while (reader.next(key, value)) {
                String tblCol = key.toString();
                String dictInfoResource = value.toString();

                if (StringUtils.isNotEmpty(dictInfoResource)) {
                    logger.info(dictInfoResource);
                    // put dictionary file to metadata store
                    DictionaryInfo dictInfoHdfs = dictMgrHdfs.getDictionaryInfo(dictInfoResource);
                    DictionaryInfo dicInfoHbase = dictMgrHbase.trySaveNewDict(dictInfoHdfs.getDictionaryObject(), dictInfoHdfs);

                    if (dicInfoHbase != null){
                        TblColRef tblColRef = cube.getDescriptor().findColumnRef(tblCol.split(":")[0], tblCol.split(":")[1]);
                        newSegCopy.putDictResPath(tblColRef, dicInfoHbase.getResourcePath());
                    }
                }
            }

            IOUtils.closeStream(reader);
        }

        CubeSegment lastSeg = mergingSegments.get(mergingSegments.size() - 1);
        for (Map.Entry<String, String> entry : lastSeg.getSnapshots().entrySet()) {
            newSegCopy.putSnapshotResPath(entry.getKey(), entry.getValue());
        }

        // update statistics
        // put the statistics to metadata store
        String statisticsFileName = newSegment.getStatisticsResourcePath();
        hbaseRS.putResource(statisticsFileName, hdfsRS.getResource(newSegment.getStatisticsResourcePath()).content(), System.currentTimeMillis());

        CubeUpdate update = new CubeUpdate(cubeCopy);
        update.setToUpdateSegs(newSegCopy);
        cubeMgr.updateCube(update);

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to merge dictionary", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 18
Source File: SaveStatisticsStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    CubeSegment newSegment = CubingExecutableUtil.findSegment(context,
            CubingExecutableUtil.getCubeName(this.getParams()),
            CubingExecutableUtil.getSegmentId(this.getParams()));
    KylinConfig kylinConf = newSegment.getConfig();

    ResourceStore rs = ResourceStore.getStore(kylinConf);
    try {

        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
        Path statisticsDir = new Path(CubingExecutableUtil.getStatisticsPath(this.getParams()));
        Path[] statisticsFiles = HadoopUtil.getFilteredPath(fs, statisticsDir,
                BatchConstants.CFG_OUTPUT_STATISTICS);
        if (statisticsFiles == null) {
            throw new IOException("fail to find the statistics file in base dir: " + statisticsDir);
        }

        Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();
        long totalRowsBeforeMerge = 0;
        long grantTotal = 0;
        int samplingPercentage = -1;
        int mapperNumber = -1;
        for (Path item : statisticsFiles) {
            CubeStatsReader.CubeStatsResult cubeStatsResult = new CubeStatsReader.CubeStatsResult(item,
                    kylinConf.getCubeStatsHLLPrecision());
            cuboidHLLMap.putAll(cubeStatsResult.getCounterMap());
            long pGrantTotal = 0L;
            for (HLLCounter hll : cubeStatsResult.getCounterMap().values()) {
                pGrantTotal += hll.getCountEstimate();
            }
            totalRowsBeforeMerge += pGrantTotal * cubeStatsResult.getMapperOverlapRatio();
            grantTotal += pGrantTotal;
            int pMapperNumber = cubeStatsResult.getMapperNumber();
            if (pMapperNumber > 0) {
                if (mapperNumber < 0) {
                    mapperNumber = pMapperNumber;
                } else {
                    throw new RuntimeException(
                            "Base cuboid has been distributed to multiple reducers at step FactDistinctColumnsReducer!!!");
                }
            }
            int pSamplingPercentage = cubeStatsResult.getPercentage();
            if (samplingPercentage < 0) {
                samplingPercentage = pSamplingPercentage;
            } else if (samplingPercentage != pSamplingPercentage) {
                throw new RuntimeException(
                        "The sampling percentage should be same among all of the reducer of FactDistinctColumnsReducer!!!");
            }
        }
        if (samplingPercentage < 0) {
            logger.warn("The sampling percentage should be set!!!");
        }
        if (mapperNumber < 0) {
            logger.warn("The mapper number should be set!!!");
        }

        if (logger.isDebugEnabled()) {
            logMapperAndCuboidStatistics(cuboidHLLMap, samplingPercentage, mapperNumber, grantTotal,
                    totalRowsBeforeMerge);
        }
        double mapperOverlapRatio = grantTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grantTotal;
        CubingJob cubingJob = (CubingJob) getManager()
                .getJob(CubingExecutableUtil.getCubingJobId(this.getParams()));
        long sourceRecordCount = cubingJob.findSourceRecordCount();
        CubeStatsWriter.writeCuboidStatistics(hadoopConf, statisticsDir, cuboidHLLMap, samplingPercentage,
                mapperNumber, mapperOverlapRatio, sourceRecordCount);

        Path statisticsFile = new Path(statisticsDir, BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME);
        logger.info("{} stats saved to hdfs {}", newSegment, statisticsFile);

        FSDataInputStream is = fs.open(statisticsFile);
        try {
            // put the statistics to metadata store
            String resPath = newSegment.getStatisticsResourcePath();
            rs.putResource(resPath, is, System.currentTimeMillis());
            logger.info("{} stats saved to resource {}", newSegment, resPath);

            StatisticsDecisionUtil.decideCubingAlgorithm(cubingJob, newSegment);
            StatisticsDecisionUtil.optimizeCubingPlan(newSegment);
        } finally {
            IOUtils.closeStream(is);
        }

        return ExecuteResult.createSucceed();
    } catch (IOException e) {
        logger.error("fail to save cuboid statistics", e);
        return ExecuteResult.createError(e);
    }
}
 
Example 19
Source File: BuildCubeWithEngine.java    From kylin-on-parquet-v2 with Apache License 2.0
public static void beforeClass(String confDir) throws Exception {
    logger.info("Adding to classpath: " + new File(confDir).getAbsolutePath());
    ClassUtil.addClasspath(new File(confDir).getAbsolutePath());

    fastBuildMode = isFastBuildMode();
    simpleBuildMode = isSimpleBuildMode();
    if (fastBuildMode) {
        logger.info("Will use fast build mode");
    }
    if (simpleBuildMode) {
        logger.info("Will use simple build mode");
    }

    String specifiedEngineType = System.getProperty("engineType");
    if (StringUtils.isNotEmpty(specifiedEngineType)) {
        engineType = Integer.parseInt(specifiedEngineType);
    }

    System.setProperty(KylinConfig.KYLIN_CONF, confDir);
    System.setProperty("SPARK_HOME", "/usr/local/spark"); // need manually create and put spark to this folder on Jenkins
    System.setProperty("kylin.hadoop.conf.dir", confDir);
    if (StringUtils.isEmpty(System.getProperty("hdp.version"))) {
        throw new RuntimeException(
                "No hdp.version set; Please set hdp.version in your jvm option, for example: -Dhdp.version=2.4.0.0-169");
    }

    HBaseMetadataTestCase.staticCreateTestMetadata(confDir);

    try {
        //check hdfs permission
        FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
        String hdfsWorkingDirectory = KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory();
        Path coprocessorDir = new Path(hdfsWorkingDirectory);
        boolean success = fileSystem.mkdirs(coprocessorDir);
        if (!success) {
            throw new IOException("mkdir fails");
        }
    } catch (IOException e) {
        throw new RuntimeException(
                "failed to create kylin.env.hdfs-working-dir, Please make sure the user has right to access "
                        + KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory(),
                e);
    }
}
 
Example 20
Source File: FlinkCubingByLayer.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    boolean enableObjectReuse = false;
    if (enableObjectReuseOptValue != null && !enableObjectReuseOptValue.isEmpty()) {
        enableObjectReuse = true;
    }

    Job job = Job.getInstance();
    FileSystem fs = HadoopUtil.getWorkingFileSystem();
    HadoopUtil.deletePath(job.getConfiguration(), new Path(outputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("DataSet input path : {}", inputPath);
    logger.info("DataSet output path : {}", outputPath);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }

    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }

    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (enableObjectReuse) {
        env.getConfig().enableObjectReuse();
    }
    env.getConfig().registerKryoType(PercentileCounter.class);
    env.getConfig().registerTypeWithKryoSerializer(PercentileCounter.class, PercentileCounterSerializer.class);

    DataSet<String[]> hiveDataSet = FlinkUtil.readHiveRecords(isSequenceFile, env, inputPath, hiveTable, job);

    DataSet<Tuple2<ByteArray, Object[]>> encodedBaseDataSet = hiveDataSet.mapPartition(
            new EncodeBaseCuboidMapPartitionFunction(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isFlinkSanityCheckEnabled()) {
        totalCount = encodedBaseDataSet.count();
    }

    final BaseCuboidReduceGroupFunction baseCuboidReducerFunction = new BaseCuboidReduceGroupFunction(cubeName, metaUrl, sConf);

    BaseCuboidReduceGroupFunction reducerFunction = baseCuboidReducerFunction;
    if (!allNormalMeasure) {
        reducerFunction = new CuboidReduceGroupFunction(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    DataSet<Tuple2<ByteArray, Object[]>>[] allDataSets = new DataSet[totalLevels + 1];
    int level = 0;

    // aggregate to calculate base cuboid
    allDataSets[0] = encodedBaseDataSet.groupBy(0).reduceGroup(baseCuboidReducerFunction);

    sinkToHDFS(allDataSets[0], metaUrl, cubeName, cubeSegment, outputPath, 0, Job.getInstance(), envConfig);

    CuboidMapPartitionFunction mapPartitionFunction = new CuboidMapPartitionFunction(cubeName, segmentId, metaUrl, sConf);

    for (level = 1; level <= totalLevels; level++) {
        allDataSets[level] = allDataSets[level - 1].mapPartition(mapPartitionFunction).groupBy(0).reduceGroup(reducerFunction);
        if (envConfig.isFlinkSanityCheckEnabled()) {
            sanityCheck(allDataSets[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        sinkToHDFS(allDataSets[level], metaUrl, cubeName, cubeSegment, outputPath, level, Job.getInstance(), envConfig);
    }

    env.execute("Cubing for : " + cubeName + " segment " + segmentId);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + FlinkBatchCubingJobBuilder2.getFileSize(outputPath, fs));
}