org.apache.kylin.common.util.HadoopUtil Java Examples

The following examples show how to use org.apache.kylin.common.util.HadoopUtil. Each snippet is taken from an open-source Apache Kylin project; the source file and project are listed above each example.
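
Before the individual examples, the short sketch below (not taken from any of the listed projects) pulls together the HadoopUtil calls that recur throughout this page: getCurrentConfiguration(), getWorkingFileSystem(), getFileSystem(path) and deletePath(conf, path). The HadoopUtilSketch class name and the "/tmp/hadooputil_demo" path are made up for illustration, and a Hadoop/Kylin environment with the usual configuration files on the classpath is assumed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kylin.common.util.HadoopUtil;

public class HadoopUtilSketch {
    public static void main(String[] args) throws IOException {
        // Hadoop configuration as seen by the current Kylin process.
        Configuration conf = HadoopUtil.getCurrentConfiguration();

        // File system of Kylin's working directory, and a file system resolved from an explicit path.
        FileSystem workingFs = HadoopUtil.getWorkingFileSystem();
        FileSystem pathFs = HadoopUtil.getFileSystem("/tmp/hadooputil_demo");

        // Delete the path if it exists, the way several of the cleanup steps below do.
        Path demo = new Path("/tmp/hadooputil_demo");
        if (pathFs.exists(demo)) {
            HadoopUtil.deletePath(conf, demo);
        }
        System.out.println("Working file system: " + workingFs.getUri());
    }
}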
Example #1
Source File: NSparkUpdateMetaAndCleanupAfterMergeStep.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    String cubeId = getParam(MetadataConstants.P_CUBE_ID);
    String[] segments = StringUtils.split(getParam(MetadataConstants.P_SEGMENT_NAMES), ",");
    KylinConfig config = KylinConfig.getInstanceFromEnv();
    CubeInstance cube = CubeManager.getInstance(config).getCubeByUuid(cubeId);

    updateMetadataAfterMerge(cubeId);

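    // Delete each merged segment's parquet storage under the cube's HDFS working directory.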
    for (String segmentName : segments) {
        String path = config.getHdfsWorkingDirectory() + cube.getProject() + "/parquet/" + cube.getName() + "/" + segmentName;
        try {
            HadoopUtil.deletePath(HadoopUtil.getCurrentConfiguration(), new Path(path));
        } catch (IOException e) {
            throw new ExecuteException("Can not delete segment: " + segmentName + ", in cube: " + cube.getName());
        }
    }

    return ExecuteResult.createSucceed();
}
 
Example #2
Source File: CubeStatsWriterTest.java    From kylin with Apache License 2.0
@Test
public void testWrite() throws IOException {
    Configuration conf = HadoopUtil.getCurrentConfiguration();
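    // Point the test at the local file system and the local MapReduce runner so no cluster is needed.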
    conf.set("fs.defaultFS", "file:///");
    conf.set("mapreduce.framework.name", "local");
    conf.set("mapreduce.application.framework.path", "");
    conf.set("fs.file.impl.disable.cache", "true");

    final Path outputPath = new Path(getTmpFolderPath(), segmentId);

    System.out.println(outputPath);
    Map<Long, HLLCounter> cuboidHLLMap = Maps.newHashMap();

    Set<Long> allCuboids = cube.getDescriptor().getAllCuboids();
    for (Long cuboid : allCuboids) {
        cuboidHLLMap.put(cuboid, createMockHLLCounter());
    }
    CubeStatsWriter.writeCuboidStatistics(conf, outputPath, cuboidHLLMap, 100);
    assertTrue(new File(outputPath.toString(), BatchConstants.CFG_STATISTICS_CUBOID_ESTIMATION_FILENAME).exists());
}
 
Example #3
Source File: StreamingServer.java    From kylin-on-parquet-v2 with Apache License 2.0
public void flushToHDFS() throws IOException {
    logger.info("start to flush cube:{} segment:{} to hdfs:{}", segment.getCubeName(),
            segment.getSegmentName(), hdfsPath);
    final FileSystem fs = HadoopUtil.getFileSystem(hdfsPath);
    final String localPath = segment.getDataSegmentFolder().getPath();
    final Path remotePath = new Path(hdfsPath);
    if (fs.exists(remotePath)) {
        logger.info("the remote path:{} is already exist, skip copy data to remote", remotePath);
        return;
    }
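    // Copy into a ".tmp" path first and only rename to the final path once the copy has finished,
    // so a partially copied segment is never visible at the remote path.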
    final Path remoteTempPath = new Path(hdfsPath + ".tmp");
    if (fs.exists(remoteTempPath)) {
        FileStatus sdst = fs.getFileStatus(remoteTempPath);
        if (sdst.isDirectory()) {
            logger.warn("target temp path: {} is an existed directory, try to delete it.", remoteTempPath);
            fs.delete(remoteTempPath, true);
            logger.warn("target temp path: {} is deleted.", remoteTempPath);
        }
    }
    fs.copyFromLocalFile(new Path(localPath), remoteTempPath);
    logger.info("data copy to remote temp path:{}", remoteTempPath);
    boolean renamed = fs.rename(remoteTempPath, remotePath);
    if (renamed) {
        logger.info("successfully rename the temp path to:{}", remotePath);
    }
}
 
Example #4
Source File: ITMassInQueryTest.java    From kylin with Apache License 2.0
@Before
public void setup() throws Exception {

    ITKylinQueryTest.clean();
    ITKylinQueryTest.joinType = "left";
    ITKylinQueryTest.setupAll();

    fileSystem = HadoopUtil.getWorkingFileSystem();

    int sellerCount = 200;
    Random r = new Random();
    vipSellers = Sets.newHashSet();
    for (int i = 0; i < sellerCount; i++) {
        vipSellers.add(10000000L + r.nextInt(1500));
    }

    Path path = new Path("/tmp/vip_customers.txt");
    fileSystem.delete(path, false);
    FSDataOutputStream outputStream = fileSystem.create(path);
    org.apache.commons.io.IOUtils.write(StringUtils.join(vipSellers, "\n"), outputStream, Charset.defaultCharset());
    outputStream.close();

    System.out.println("The filter is " + vipSellers);
}
 
Example #5
Source File: IICLI.java    From Kylin with Apache License 2.0
public static void main(String[] args) throws IOException {
	Configuration hconf = HadoopUtil.getCurrentConfiguration();
	IIManager mgr = IIManager.getInstance(KylinConfig.getInstanceFromEnv());

	String iiName = args[0];
	IIInstance ii = mgr.getII(iiName);

	String path = args[1];
	System.out.println("Reading from " + path + " ...");

	TableRecordInfo info = new TableRecordInfo(ii.getFirstSegment());
	IIKeyValueCodec codec = new IIKeyValueCodec(info.getDigest());
	int count = 0;
	for (Slice slice : codec.decodeKeyValue(readSequenceKVs(hconf, path))) {
		for (RawTableRecord rec : slice) {
			System.out.printf(new TableRecord(rec, info).toString());
			count++;
		}
	}
	System.out.println("Total " + count + " records");
}
 
Example #6
Source File: CubeStatsReader.java    From kylin-on-parquet-v2 with Apache License 2.0
/**
 * @param cuboidScheduler if it is null, some of this reader's functions will not be supported
 */
public CubeStatsReader(CubeSegment cubeSegment, CuboidScheduler cuboidScheduler, KylinConfig kylinConfig)
        throws IOException {
    ResourceStore store = ResourceStore.getStore(kylinConfig);
    String statsKey = cubeSegment.getStatisticsResourcePath();
    RawResource resource = store.getResource(statsKey);
    if (resource == null)
        throw new IllegalStateException("Missing resource at " + statsKey);

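    // The statistics blob is written to a local temp file and read back as a sequence file
    // through a "file://" path (fixWindowsPath adjusts the path for Windows).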
    File tmpSeqFile = writeTmpSeqFile(resource.content());
    Path path = new Path(HadoopUtil.fixWindowsPath("file://" + tmpSeqFile.getAbsolutePath()));

    CubeStatsResult cubeStatsResult = new CubeStatsResult(path, kylinConfig.getCubeStatsHLLPrecision());
    tmpSeqFile.delete();

    this.seg = cubeSegment;
    this.cuboidScheduler = cuboidScheduler;
    this.samplingPercentage = cubeStatsResult.getPercentage();
    this.mapperNumberOfFirstBuild = cubeStatsResult.getMapperNumber();
    this.mapperOverlapRatioOfFirstBuild = cubeStatsResult.getMapperOverlapRatio();
    this.cuboidRowEstimatesHLL = cubeStatsResult.getCounterMap();
    this.sourceRowCount = cubeStatsResult.getSourceRecordCount();
}
 
Example #7
Source File: CubeStatsReader.java    From kylin-on-parquet-v2 with Apache License 2.0
public CubeStatsResult(Path path, int precision) throws IOException {
    Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
    Option seqInput = SequenceFile.Reader.file(path);
    try (Reader reader = new SequenceFile.Reader(hadoopConf, seqInput)) {
        LongWritable key = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), hadoopConf);
        BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), hadoopConf);
        while (reader.next(key, value)) {
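            // The key selects the record type: 0 = sampling percentage, -1 = mapper overlap ratio,
            // -2 = mapper number, -3 = source record count; positive keys map cuboid ids to HLL counters.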
            if (key.get() == 0L) {
                percentage = Bytes.toInt(value.getBytes());
            } else if (key.get() == -1) {
                mapperOverlapRatio = Bytes.toDouble(value.getBytes());
            } else if (key.get() == -2) {
                mapperNumber = Bytes.toInt(value.getBytes());
            } else if (key.get() == -3) {
                sourceRecordCount = Bytes.toLong(value.getBytes());
            } else if (key.get() > 0) {
                HLLCounter hll = new HLLCounter(precision);
                ByteArray byteArray = new ByteArray(value.getBytes());
                hll.readRegisters(byteArray.asBuffer());
                counterMap.put(key.get(), hll);
            }
        }
    }
}
 
Example #8
Source File: StreamingServer.java    From kylin with Apache License 2.0
public void flushToHDFS() throws IOException {
    logger.info("start to flush cube:{} segment:{} to hdfs:{}", segment.getCubeName(),
            segment.getSegmentName(), hdfsPath);
    final FileSystem fs = HadoopUtil.getFileSystem(hdfsPath);
    final String localPath = segment.getDataSegmentFolder().getPath();
    final Path remotePath = new Path(hdfsPath);
    if (fs.exists(remotePath)) {
        logger.info("the remote path:{} is already exist, skip copy data to remote", remotePath);
        return;
    }
    final Path remoteTempPath = new Path(hdfsPath + ".tmp");
    if (fs.exists(remoteTempPath)) {
        FileStatus sdst = fs.getFileStatus(remoteTempPath);
        if (sdst.isDirectory()) {
            logger.warn("target temp path: {} is an existed directory, try to delete it.", remoteTempPath);
            fs.delete(remoteTempPath, true);
            logger.warn("target temp path: {} is deleted.", remoteTempPath);
        }
    }
    fs.copyFromLocalFile(new Path(localPath), remoteTempPath);
    logger.info("data copy to remote temp path:{}", remoteTempPath);
    boolean renamed = fs.rename(remoteTempPath, remotePath);
    if (renamed) {
        logger.info("successfully rename the temp path to:{}", remotePath);
    }
}
 
Example #9
Source File: HDFSResourceStore.java    From kylin with Apache License 2.0
public HDFSResourceStore(KylinConfig kylinConfig, StorageURL metadataUrl) throws Exception {
    super(kylinConfig);
    Preconditions.checkState(HDFS_SCHEME.equals(metadataUrl.getScheme()));

    String path = metadataUrl.getParameter("path");
    if (path == null) {
        // missing path is not expected, but don't fail it
        path = kylinConfig.getHdfsWorkingDirectory(null) + "tmp_metadata";
        logger.warn("Missing path, fall back to {}. ", path);
    }

    fs = HadoopUtil.getFileSystem(path);
    Path metadataPath = new Path(path);
    if (!fs.exists(metadataPath)) {
        logger.warn("Path not exist in HDFS, create it: {}. ", path);
        createMetaFolder(metadataPath);
    }

    hdfsMetaPath = metadataPath;
    logger.info("hdfs meta path : {}", hdfsMetaPath);

}
 
Example #10
Source File: KylinHealthCheckJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private void checkSegmentHDFSPath(List<CubeInstance> cubes) throws IOException {
    reporter.log("## Fix missing HDFS path of segments");
    FileSystem defaultFs = HadoopUtil.getWorkingFileSystem();
    for (CubeInstance cube : cubes) {
        for (CubeSegment segment : cube.getSegments()) {
            String jobUuid = segment.getLastBuildJobID();
            if (jobUuid != null && jobUuid.equals("") == false) {
                String path = JobBuilderSupport.getJobWorkingDir(config.getHdfsWorkingDirectory(), jobUuid);
                if (!defaultFs.exists(new Path(path))) {
                    reporter.log(
                            "Project: {} cube: {} segment: {} cube id data: {} don't exist and need to rebuild it",
                            cube.getProject(), cube.getName(), segment, path);
                    reporter.log(
                            "The rebuild url: -d '{\"startTime\":{}, \"endTime\":{}, \"buildType\":\"REFRESH\"}' /kylin/api/cubes/{}/build",
                            segment.getTSRange().start, segment.getTSRange().end, cube.getName());
                }
            }
        }
    }
}
 
Example #11
Source File: HiveTable.java    From Kylin with Apache License 2.0
private String computeHDFSLocation(boolean needFilePath) throws IOException {

    String override = KylinConfig.getInstanceFromEnv().getOverrideHiveTableLocation(hiveTable);
    if (override != null) {
        logger.debug("Override hive table location " + hiveTable + " -- " + override);
        return override;
    }

    String hdfsDir = null;
    try {
        HiveClient hiveClient = new HiveClient();
        hdfsDir = hiveClient.getHiveTableLocation(database, hiveTable);
    } catch (Exception e) {
        e.printStackTrace();
        throw new IOException(e);
    }

    if (needFilePath) {
        FileSystem fs = HadoopUtil.getFileSystem(hdfsDir);
        FileStatus file = findOnlyFile(hdfsDir, fs);
        return file.getPath().toString();
    } else {
        return hdfsDir;
    }
}
 
Example #12
Source File: CubeService.java    From kylin-on-parquet-v2 with Apache License 2.0
private void cleanSegmentStorage(List<CubeSegment> toRemoveSegs) throws IOException {
    if (!KylinConfig.getInstanceFromEnv().cleanStorageAfterDelOperation()) {
        return;
    }

    if (toRemoveSegs != null && !toRemoveSegs.isEmpty()) {
        List<String> toDropHTables = Lists.newArrayListWithCapacity(toRemoveSegs.size());
        List<String> toDelHDFSPaths = Lists.newArrayListWithCapacity(toRemoveSegs.size());
        for (CubeSegment seg : toRemoveSegs) {
            toDropHTables.add(seg.getStorageLocationIdentifier());
            toDelHDFSPaths.add(JobBuilderSupport.getJobWorkingDir(seg.getConfig().getHdfsWorkingDirectory(),
                    seg.getLastBuildJobID()));
        }

        StorageCleanUtil.dropHTables(new HBaseAdmin(HBaseConnection.getCurrentHBaseConfiguration()), toDropHTables);
        StorageCleanUtil.deleteHDFSPath(HadoopUtil.getWorkingFileSystem(), toDelHDFSPaths);
    }
}
 
Example #13
Source File: HDFSPathGarbageCollectionStep.java    From kylin with Apache License 2.0
@Override
protected ExecuteResult doWork(ExecutableContext context) throws ExecuteException {
    try {
        config = new JobEngineConfig(context.getConfig());
        List<String> toDeletePaths = getDeletePaths();
        dropHdfsPathOnCluster(toDeletePaths, HadoopUtil.getWorkingFileSystem());

        if (StringUtils.isNotEmpty(context.getConfig().getHBaseClusterFs())) {
            dropHdfsPathOnCluster(toDeletePaths, FileSystem.get(HBaseConnection.getCurrentHBaseConfiguration()));
        }
    } catch (IOException e) {
        logger.error("job:" + getId() + " execute finished with exception", e);
        output.append("\n").append(e.getLocalizedMessage());
    }

    return new ExecuteResult(ExecuteResult.State.SUCCEED, output.toString());
}
 
Example #14
Source File: HBaseConnection.java    From kylin-on-parquet-v2 with Apache License 2.0
public static String makeQualifiedPathInHBaseCluster(String inPath) {
    Path path = new Path(inPath);
    path = Path.getPathWithoutSchemeAndAuthority(path);

    FileSystem fs = HadoopUtil.getFileSystem(path, getCurrentHBaseConfiguration()); // Must be HBase's FS, not working FS
    return fs.makeQualified(path).toString();
}
 
Example #15
Source File: DstClusterUtil.java    From kylin with Apache License 2.0
public static void copyInit(FileSystem fs, Path path) throws IOException {
    path = Path.getPathWithoutSchemeAndAuthority(path);
    Path pathP = path.getParent();
    if (!fs.exists(pathP)) {
        fs.mkdirs(pathP);
    }
    if (fs.exists(path)) {
        logger.warn("path {} already existed and will be deleted", path);
        HadoopUtil.deletePath(fs.getConf(), path);
    }
}
 
Example #16
Source File: GarbageCollectionStep.java    From kylin-on-parquet-v2 with Apache License 2.0
private void rmdirOnHDFS(List<String> paths) throws IOException {
    for (String path : paths) {
        Path externalDataPath = new Path(path);
        FileSystem fs = HadoopUtil.getWorkingFileSystem();
        if (fs.exists(externalDataPath)) {
            fs.delete(externalDataPath, true);
        }
    }
}
 
Example #17
Source File: SparkUHCDictionary.java    From kylin with Apache License 2.0
@Override
public Tuple2<Integer, List<String>> call(String sequenceFilePath) throws Exception {
    Path path = new Path(sequenceFilePath);
    logger.info("Column absolute path is " + path.toString());
    if (!HadoopUtil.getFileSystem(path).exists(path)) {
        return new Tuple2<>(-1, null);
    }

    String columnName = path.getParent().getName();
    int index = -1;
    for (int i = 0;i < uhcColumns.size(); i++) {
        if (uhcColumns.get(i).getIdentity().equalsIgnoreCase(columnName)) {
            index = i;
            break;
        }
    }

    if (index == -1) {
        return new Tuple2<>(-1, null);
    }

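    // Bind the job's KylinConfig to the current thread only while the distinct values are read.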
    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        List<String> values = Lists.newArrayList();
        values.addAll(HadoopUtil.readDistinctColumnValues(sequenceFilePath));

        logger.info("UHC column " + columnName + " contains distinct values " + values);

        return new Tuple2<>(index, values);
    }
}
 
Example #18
Source File: SparkApplication.java    From kylin-on-parquet-v2 with Apache License 2.0
protected Boolean hasCountDistinct() throws IOException {
    Path countDistinct = new Path(config.getJobTmpShareDir(project, jobId),
            ResourceDetectUtils.countDistinctSuffix());
    FileSystem fileSystem = countDistinct.getFileSystem(HadoopUtil.getCurrentConfiguration());
    Boolean exist;
    if (fileSystem.exists(countDistinct)) {
        exist = ResourceDetectUtils.readResourcePathsAs(countDistinct);
    } else {
        exist = false;
        logger.info("File count_distinct.json doesn't exist, set hasCountDistinct to false.");
    }
    logger.info("Exist count distinct measure: {}", exist);
    return exist;
}
 
Example #19
Source File: SparkUHCDictionary.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public Tuple2<Integer, List<String>> call(String sequenceFilePath) throws Exception {
    Path path = new Path(sequenceFilePath);
    logger.info("Column absolute path is " + path.toString());
    if (!HadoopUtil.getFileSystem(path).exists(path)) {
        return new Tuple2<>(-1, null);
    }

    String columnName = path.getParent().getName();
    int index = -1;
    for (int i = 0;i < uhcColumns.size(); i++) {
        if (uhcColumns.get(i).getIdentity().equalsIgnoreCase(columnName)) {
            index = i;
            break;
        }
    }

    if (index == -1) {
        return new Tuple2<>(-1, null);
    }

    try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
            .setAndUnsetThreadLocalConfig(config)) {
        List<String> values = Lists.newArrayList();
        values.addAll(HadoopUtil.readDistinctColumnValues(sequenceFilePath));

        logger.info("UHC column " + columnName + " contains distinct values " + values);

        return new Tuple2<>(index, values);
    }
}
 
Example #20
Source File: HBaseConnection.java    From kylin with Apache License 2.0
private static Configuration newHBaseConfiguration(StorageURL url) {
    // using a hbase:xxx URL is deprecated, instead hbase config is always loaded from hbase-site.xml in classpath
    if (!"hbase".equals(url.getScheme()))
        throw new IllegalArgumentException("to use hbase storage, pls set 'kylin.storage.url=hbase' in kylin.properties");

    Configuration conf = HBaseConfiguration.create(HadoopUtil.getCurrentConfiguration());
    addHBaseClusterNNHAConfiguration(conf);

    // support hbase using a different FS
    KylinConfig kylinConf = KylinConfig.getInstanceFromEnv();
    String hbaseClusterFs = kylinConf.getHBaseClusterFs();
    if (StringUtils.isNotEmpty(hbaseClusterFs)) {
        conf.set(FileSystem.FS_DEFAULT_NAME_KEY, hbaseClusterFs);
    } else {
        try {
            FileSystem fs = HadoopUtil.getWorkingFileSystem(HadoopUtil.getCurrentConfiguration());
            conf.set(FileSystem.FS_DEFAULT_NAME_KEY, fs.getUri().toString());
            logger.debug("Using the working dir FS for HBase: " + fs.getUri().toString());
        } catch (IOException e) {
            logger.error("Fail to set working dir to HBase configuration", e);
        }
    }

    // https://issues.apache.org/jira/browse/KYLIN-953
    if (StringUtils.isBlank(conf.get("hadoop.tmp.dir"))) {
        conf.set("hadoop.tmp.dir", "/tmp");
    }
    if (StringUtils.isBlank(conf.get("hbase.fs.tmp.dir"))) {
        conf.set("hbase.fs.tmp.dir", "/tmp");
    }
    
    for (Entry<String, String> entry : url.getAllParameters().entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    return conf;
}
 
Example #21
Source File: HDFSResourceStoreTest.java    From kylin with Apache License 2.0
@Test
public void testListResourcesImpl() throws Exception {
    String path = "../examples/test_metadata/";
    String cp = new File(path).getCanonicalFile().getPath();
    FileSystem fs = HadoopUtil.getFileSystem(cp);
    HDFSResourceStore store = new HDFSResourceStore(KylinConfig.getInstanceFromEnv(),
            StorageURL.valueOf("hdfs@hdfs"));
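    // Inject a local FileSystem via reflection so the store operates on the local test metadata directory.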
    Field field = store.getClass().getDeclaredField("fs");
    field.setAccessible(true);
    field.set(store, fs);

    File f1 = new File(cp + "/resource/resource/e1.json");
    File f2 = new File(cp + "/resource/resource/e2.json");
    if (!f1.getParentFile().exists()) {
        if (!f1.getParentFile().mkdirs()) {
            throw new RuntimeException("Can not create dir.");
        }
    }
    if (!(f1.createNewFile() && f2.createNewFile())) {
        throw new RuntimeException("Can not create file.");
    }

    Path p = new Path(cp);
    TreeSet<String> resources = store.getAllFilePath(new Path(p, "resource"), "/resource/");
    TreeSet<String> expected = new TreeSet<>();
    expected.add("/resource/resource/e1.json");
    expected.add("/resource/resource/e2.json");
    Assert.assertEquals(expected, resources);
}
 
Example #22
Source File: Coordinator.java    From kylin with Apache License 2.0
private void removeHDFSFiles(String cubeName, String segmentName) {
    String segmentHDFSPath = HDFSUtil.getStreamingSegmentFilePath(cubeName, segmentName);
    try {
        FileSystem fs = HadoopUtil.getFileSystem(segmentHDFSPath);
        fs.delete(new Path(segmentHDFSPath), true);
    } catch (Exception e) {
        logger.error("error when remove hdfs file, hdfs path:{}", segmentHDFSPath);
    }
}
 
Example #23
Source File: NDCuboidJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
@Before
public void setup() throws Exception {
    conf = HadoopUtil.getCurrentConfiguration();
    conf.set("fs.default.name", "file:///");
    conf.set("mapreduce.framework.name", "local");
    conf.set("mapreduce.application.framework.path", "");

    // for local runner out-of-memory issue
    conf.set("mapreduce.task.io.sort.mb", "10");

    createTestMetadata();
}
 
Example #24
Source File: CubingJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private double getRealSizeByLevel(String rootPath, int level) {
    try {
        String levelPath = JobBuilderSupport.getCuboidOutputPathsByLevel(rootPath, level);
        FileSystem fs = HadoopUtil.getFileSystem(levelPath);
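        // ContentSummary length is in bytes; divide by 1024 * 1024 to report megabytes.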
        return fs.getContentSummary(new Path(levelPath)).getLength() / (1024L * 1024L);
    } catch (Exception e) {
        logger.warn("get level real size failed." + e);
        return 0L;
    }
}
 
Example #25
Source File: CubeService.java    From Kylin with Apache License 2.0
/**
 * Generate cardinality for table This will trigger a hadoop job
 * The result will be merged into table exd info
 *
 * @param tableName
 */
public void calculateCardinality(String tableName, String submitter) {
    String[] dbTableName = HadoopUtil.parseHiveTableName(tableName);
    tableName = dbTableName[0] + "." + dbTableName[1];
    TableDesc table = getMetadataManager().getTableDesc(tableName);
    final Map<String, String> tableExd = getMetadataManager().getTableDescExd(tableName);
    if (tableExd == null || table == null) {
        IllegalArgumentException e = new IllegalArgumentException("Cannot find table descirptor " + tableName);
        logger.error("Cannot find table descirptor " + tableName, e);
        throw e;
    }

    DefaultChainedExecutable job = new DefaultChainedExecutable();
    job.setName("Hive Column Cardinality calculation for table '" + tableName + "'");
    job.setSubmitter(submitter);

    String outPath = HiveColumnCardinalityJob.OUTPUT_PATH + "/" + tableName;
    String param = "-table " + tableName + " -output " + outPath;

    HadoopShellExecutable step1 = new HadoopShellExecutable();

    step1.setJobClass(HiveColumnCardinalityJob.class);
    step1.setJobParams(param);

    job.addTask(step1);

    HadoopShellExecutable step2 = new HadoopShellExecutable();

    step2.setJobClass(HiveColumnCardinalityUpdateJob.class);
    step2.setJobParams(param);
    job.addTask(step2);

    getExecutableManager().addJob(job);
}
 
Example #26
Source File: HiveColumnCardinalityUpdateJob.java    From kylin-on-parquet-v2 with Apache License 2.0
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = HadoopUtil.getWorkingFileSystem();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : StringUtil.split(raw, "\n")) {
            results.add(str);
        }
    }
    return results;
}
 
Example #27
Source File: AbstractHadoopJob.java    From kylin with Apache License 2.0
public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0;//return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
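            // A trailing "/*" means expand the folder: recurse into sub-directories (skipping names
            // starting with "_"), or add the folder itself if it contains only files.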
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);

            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }

            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.trace("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
 
Example #28
Source File: AbstractHadoopJob.java    From kylin-on-parquet-v2 with Apache License 2.0
public static int addInputDirs(String[] inputs, Job job) throws IOException {
    int ret = 0;//return number of added folders
    for (String inp : inputs) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
            Path path = new Path(inp);

            if (!exists(fs, path)) {
                logger.warn("Path not exist:" + path.toString());
                continue;
            }

            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
                    hasDir = true;
                    ret += addInputDirs(new String[] { stat.getPath().toString() }, job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                ret += addInputDirs(new String[] { path.toString() }, job);
            }
        } else {
            logger.trace("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
            ret++;
        }
    }
    return ret;
}
 
Example #29
Source File: InvertedIndexJob.java    From Kylin with Apache License 2.0
private void setupMapper(String intermediateTable) throws IOException {

    String[] dbTableNames = HadoopUtil.parseHiveTableName(intermediateTable);
    HCatInputFormat.setInput(job, dbTableNames[0], dbTableNames[1]);

    job.setInputFormatClass(HCatInputFormat.class);

    job.setMapperClass(InvertedIndexMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ImmutableBytesWritable.class);
    job.setPartitionerClass(InvertedIndexPartitioner.class);
}
 
Example #30
Source File: HBaseMROutput2Transition.java    From kylin-on-parquet-v2 with Apache License 2.0
@Override
public void configureJobOutput(Job job, String output, CubeSegment segment) throws Exception {
    int reducerNum = MapReduceUtil.getLayeredCubingReduceTaskNum(segment, segment.getCuboidScheduler(),
            AbstractHadoopJob.getTotalMapInputMB(job), -1);
    job.setNumReduceTasks(reducerNum);

    Path outputPath = new Path(output);
    HadoopUtil.deletePath(job.getConfiguration(), outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
}