org.apache.hadoop.hive.ql.plan.PartitionDesc Java Examples

The following examples show how to use org.apache.hadoop.hive.ql.plan.PartitionDesc. Each snippet is taken from an open-source project; the source file and license are noted above the code.
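All of the examples share one pattern: resolve the PartitionDesc for an input path from the map-side query plan, then read the partition's input format class and table properties from it. The sketch below illustrates that pattern in isolation. The method name describePartition is made up for illustration, the usual Hive imports (org.apache.hadoop.hive.ql.exec.Utilities, org.apache.hadoop.hive.ql.plan.*) are assumed as in the examples, and it targets a Hive version where getPathToPartitionInfo() is keyed by Path; older releases, such as the ones targeted by the parquet-mr bindings below, key the map by String.

private static String describePartition(JobConf job, Path splitPath) {
  // Deserialize the map-side plan that Hive ships with the job.
  MapWork mapWork = Utilities.getMapWork(job);
  Map<Path, PartitionDesc> pathToPartitionInfo = mapWork.getPathToPartitionInfo();

  // Null if the path is not a registered partition root.
  PartitionDesc part = pathToPartitionInfo.get(splitPath);
  if (part == null) {
    return null;
  }

  // The two pieces of information most callers below need.
  String inputFormatName = part.getInputFileFormatClass().getName();
  Properties tableProps = (part.getTableDesc() != null)
      ? part.getTableDesc().getProperties() : new Properties();
  return inputFormatName + " (" + tableProps.size() + " table properties)";
}

The Hudi examples do the same lookup through a getPartitionFromPath helper that is passed a map from IOPrepareCache, so repeated lookups for the same partition path can be served from that cache instead of being resolved again.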
Example #1
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
public CombineHiveInputSplit(JobConf job, CombineFileSplit inputSplitShim,
    Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  this.inputSplitShim = inputSplitShim;
  this.pathToPartitionInfo = pathToPartitionInfo;
  if (job != null) {
    if (this.pathToPartitionInfo == null) {
      this.pathToPartitionInfo = Utilities.getMapWork(job).getPathToPartitionInfo();
    }

    // determine the input format class name from the first chunk in the CombinedSplit
    Path[] ipaths = inputSplitShim.getPaths();
    if (ipaths.length > 0) {
      PartitionDesc part = getPartitionFromPath(this.pathToPartitionInfo, ipaths[0],
          IOPrepareCache.get().getPartitionDescMap());
      inputFormatClassName = part.getInputFileFormatClass().getName();
    }
  }
}
 
Example #2
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
/**
 * Writable interface.
 */
@Override
public void write(DataOutput out) throws IOException {
  if (inputFormatClassName == null) {
    if (pathToPartitionInfo == null) {
      pathToPartitionInfo = Utilities.getMapWork(getJob()).getPathToPartitionInfo();
    }

    // look up the PartitionDesc for the first chunk in the CombinedSplit
    PartitionDesc part = getPartitionFromPath(pathToPartitionInfo, inputSplitShim.getPath(0),
        IOPrepareCache.get().getPartitionDescMap());

    // resolve and cache the input format class name on first use
    inputFormatClassName = part.getInputFileFormatClass().getName();
  }
  Text.writeString(out, inputFormatClassName);
  if (HoodieParquetRealtimeInputFormat.class.getName().equals(inputFormatClassName)) {
    // Write Shim Class Name
    Text.writeString(out, inputSplitShim.getClass().getName());
  }
  inputSplitShim.write(out);
}
 
Example #3
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
@Override
public Set<Integer> call() throws Exception {
  Set<Integer> nonCombinablePathIndices = new HashSet<Integer>();
  for (int i = 0; i < length; i++) {
    PartitionDesc part = getPartitionFromPath(pathToPartitionInfo, paths[i + start],
        IOPrepareCache.get().allocatePartitionDescMap());
    // Use HiveInputFormat if any of the paths is not splittable
    Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
    InputFormat<WritableComparable, Writable> inputFormat = getInputFormatFromCache(inputFormatClass, conf);
    if (inputFormat instanceof AvoidSplitCombination
        && ((AvoidSplitCombination) inputFormat).shouldSkipCombine(paths[i + start], conf)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("The path [" + paths[i + start] + "] is being parked for HiveInputFormat.getSplits");
      }
      nonCombinablePathIndices.add(i + start);
    }
  }
  return nonCombinablePathIndices;
}
 
Example #4
Source File: Hive012Binding.java    From parquet-mr with Apache License 2.0
/**
 * Initialize the mapWork variable in order to get all the partitions and start updating the jobconf
 *
 * @param job
 */
private void init(final JobConf job) {
  final String plan = HiveConf.getVar(job, HiveConf.ConfVars.PLAN);
  if (mapWork == null && plan != null && plan.length() > 0) {
    mapWork = Utilities.getMapWork(job);
    pathToPartitionInfo.clear();
    for (final Map.Entry<String, PartitionDesc> entry : mapWork.getPathToPartitionInfo().entrySet()) {
      pathToPartitionInfo.put(new Path(entry.getKey()).toUri().getPath().toString(), entry.getValue());
    }
  }
}
 
Example #5
Source File: Hive012Binding.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path)
    throws IOException {
  init(jobConf);
  final JobConf cloneJobConf = new JobConf(jobConf);
  final PartitionDesc part = pathToPartitionInfo.get(path.toString());

  if ((part != null) && (part.getTableDesc() != null)) {
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf);
  }

  pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString());
  return cloneJobConf;
}
 
Example #6
Source File: Hive010Binding.java    From parquet-mr with Apache License 2.0
/**
 * Initialize the mrwork variable in order to get all the partitions and start updating the jobconf
 *
 * @param job
 */
private void init(final JobConf job) {
  final String plan = HiveConf.getVar(job, HiveConf.ConfVars.PLAN);
  if (mrwork == null && plan != null && plan.length() > 0) {
    mrwork = Utilities.getMapRedWork(job);
    pathToPartitionInfo.clear();
    for (final Map.Entry<String, PartitionDesc> entry : mrwork.getPathToPartitionInfo().entrySet()) {
      pathToPartitionInfo.put(new Path(entry.getKey()).toUri().getPath().toString(), entry.getValue());
    }
  }
}
 
Example #7
Source File: Hive010Binding.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path)
    throws IOException {
  init(jobConf);
  final JobConf cloneJobConf = new JobConf(jobConf);
  final PartitionDesc part = pathToPartitionInfo.get(path.toString());

  if ((part != null) && (part.getTableDesc() != null)) {
    Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf);
  }

  pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString());
  return cloneJobConf;
}
 
Example #8
Source File: HiveReaderSetting.java    From multiple-dimension-spread with Apache License 2.0
public HiveReaderSetting( final FileSplit split, final JobConf job ){
  config = new Configuration();

  disableSkipBlock = job.getBoolean( "mds.disable.block.skip" , false );
  disableFilterPushdown = job.getBoolean( "mds.disable.filter.pushdown" , false );

  Set<String> pathNameSet = createPathSet( split.getPath() );
  List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
  String filterExprSerialized = job.get( TableScanDesc.FILTER_EXPR_CONF_STR );
  if( filterExprSerialized != null ){
    filterExprs.add( Utilities.deserializeExpression(filterExprSerialized) );
  }

  MapWork mapWork;
  try{
    mapWork = Utilities.getMapWork(job);
  }catch( Exception e ){
    mapWork = null;
  }

  if( mapWork == null ){
    node = createExpressionNode( filterExprs );
    isVectorModeFlag = false;
    return;
  }

  node = createExpressionNode( filterExprs );

  for( Map.Entry<String,PartitionDesc> pathsAndParts: mapWork.getPathToPartitionInfo().entrySet() ){
    if( ! pathNameSet.contains( pathsAndParts.getKey() ) ){
      continue;
    }
    Properties props = pathsAndParts.getValue().getTableDesc().getProperties();
    if( props.containsKey( "mds.expand" ) ){
      config.set( "spread.reader.expand.column" , props.getProperty( "mds.expand" ) );
    }
    if( props.containsKey( "mds.flatten" ) ){
      config.set( "spread.reader.flatten.column" , props.getProperty( "mds.flatten" ) );
    }
  }

  config.set( "spread.reader.read.column.names" , createReadColumnNames( job.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , null ) ) );

  // Next Hive version:
  // Utilities.getUseVectorizedInputFileFormat(job)
  isVectorModeFlag = Utilities.isVectorMode( job );
}
 
Example #9
Source File: HoodieCombineRealtimeHiveSplit.java    From hudi with Apache License 2.0
public HoodieCombineRealtimeHiveSplit(JobConf jobConf, CombineFileSplit
    combineFileSplit, Map<Path, PartitionDesc> map)
    throws IOException {
  super(jobConf, combineFileSplit, map);
}
 
Example #10
Source File: TestHoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
@Test
@Disabled
public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception {

  Configuration conf = new Configuration();
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  final int numRecords = 1000;
  // Create 3 parquet files with 1000 records each
  File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
  InputFormatTestUtil.commit(tempDir, commitTime);

  // insert 1000 update records to log file 0
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer =
      InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime,
          numRecords, numRecords, 0);
  writer.close();
  // insert 1000 update records to log file 1
  writer =
      InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime,
          numRecords, numRecords, 0);
  writer.close();
  // insert 1000 update records to log file 2
  writer =
      InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime,
          numRecords, numRecords, 0);
  writer.close();

  TableDesc tblDesc = Utilities.defaultTd;
  // Set the input format
  tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class);
  PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
  LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
  pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
  MapredWork mrwork = new MapredWork();
  mrwork.getMapWork().setPathToPartitionInfo(pt);
  Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
  Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
  jobConf = new JobConf(conf);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
  jobConf.set(HAS_MAP_WORK, "true");
  // The following config tells Hive to choose ExecMapper to read the MAP_WORK
  jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
  // setting the split size to be 3 to create one split for 3 file groups
  jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3");

  HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
  String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
  InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
  InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
  // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups
  assertEquals(1, splits.length);
  RecordReader<NullWritable, ArrayWritable> recordReader =
      combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
  NullWritable nullWritable = recordReader.createKey();
  ArrayWritable arrayWritable = recordReader.createValue();
  int counter = 0;
  while (recordReader.next(nullWritable, arrayWritable)) {
    // read over all the splits
    counter++;
  }
  // should read 3000 records in total: 1000 each from file0, file1 and file2
  assertEquals(3000, counter);
}
 
Example #11
Source File: CopybookRecordReader.java    From CopybookInputFormat with Apache License 2.0
public CopybookRecordReader(FileSplit genericSplit, JobConf job)
    throws IOException {
  try {
    String cblPath = job.get(Const.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF);

    if (cblPath == null) {
      if (job != null) {
        MapWork mrwork = Utilities.getMapWork(job);

        if (mrwork == null) {
          System.out.println("When running a client side hive job you have to set \"copybook.inputformat.cbl.hdfs.path\" before executing the query." );
          System.out.println("When running a MR job we can get this from the hive TBLProperties" );
        }
        Map<String, PartitionDesc> map = mrwork.getPathToPartitionInfo();
        
        for (Map.Entry<String, PartitionDesc> pathsAndParts : map.entrySet()) {
          System.out.println("Hey");
          Properties props = pathsAndParts.getValue().getProperties();
          cblPath = props
              .getProperty(Const.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF);
          break;
        }
      }
    }

    FileSystem fs = FileSystem.get(job);
    BufferedInputStream inputStream = new BufferedInputStream(
        fs.open(new Path(cblPath)));
    CobolCopybookLoader copybookInt = new CobolCopybookLoader();
    externalRecord = copybookInt
        .loadCopyBook(inputStream, "RR", CopybookLoader.SPLIT_NONE, 0,
            "cp037", Convert.FMT_MAINFRAME, 0, null);

    int fileStructure = Constants.IO_FIXED_LENGTH;

    for (ExternalField field : externalRecord.getRecordFields()) {
      recordByteLength += field.getLen();
    }

    // jump to the point in the split that the first whole record of split
    // starts at
    FileSplit split = (FileSplit) genericSplit;

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    BufferedInputStream fileIn = new BufferedInputStream(fs.open(split
        .getPath()));

    if (start != 0) {
      pos = start - (start % recordByteLength) + recordByteLength;

      fileIn.skip(pos);
    }

    ret = LineIOProvider.getInstance().getLineReader(
        fileStructure,
        LineIOProvider.getInstance().getLineProvider(fileStructure));

    ret.open(fileIn, externalRecord);
  } catch (Exception e) {
    e.printStackTrace();
  } 

}