org.apache.hadoop.hive.serde2.ColumnProjectionUtils Java Examples

The following examples show how to use org.apache.hadoop.hive.serde2.ColumnProjectionUtils. Each example is taken from an open-source project; the source file, project, and license are noted above each code listing.
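
Before diving into the project code, the following minimal sketch (not taken from any of the projects below; the column names and ids are hypothetical) shows the typical round trip: an input format or planner appends the projected column ids and names to the Configuration, and a record reader later queries them back.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class ColumnProjectionSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Writer side: declare that only columns 0 and 2 need to be read, and record
    // their (hypothetical) names for readers that key off names rather than ids.
    ColumnProjectionUtils.appendReadColumns(conf, Arrays.asList(0, 2));
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "id,price");

    // Reader side: a record reader asks which columns it actually has to materialize.
    List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(conf);
    boolean readAll = ColumnProjectionUtils.isReadAllColumns(conf);
    System.out.println("projected ids: " + readColumnIds + ", read all columns: " + readAll);
  }
}
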
Example #1
Source File: HiveVectorizedReaderSetting.java    From multiple-dimension-spread with Apache License 2.0
public HiveVectorizedReaderSetting( final FileSplit split , final JobConf job , final HiveReaderSetting hiveReaderConfig ) throws IOException{
  this.hiveReaderConfig = hiveReaderConfig;

  rbCtx = Utilities.getVectorizedRowBatchCtx( job );
  partitionValues = new Object[rbCtx.getPartitionColumnCount()];
  if( 0 < partitionValues.length ){
    rbCtx.getPartitionValues( rbCtx, job, split, partitionValues );
  }

  TypeInfo[] typeInfos = rbCtx.getRowColumnTypeInfos();
  columnNames = rbCtx.getRowColumnNames();
  needColumnIds = createNeedColumnId( ColumnProjectionUtils.getReadColumnIDs( job ) );

  projectionColumn = new boolean[columnNames.length];
  assignors = new IColumnVectorAssignor[columnNames.length];
  for( int id : needColumnIds ){
    projectionColumn[id] = true;
    assignors[id] = ColumnVectorAssignorFactory.create( typeInfos[id] );
  }
}
 
Example #2
Source File: MDSSerde.java    From multiple-dimension-spread with Apache License 2.0
@Override
public void initialize( final Configuration conf, final Properties table , final Properties part ) throws SerDeException{
  LOG.info( table.toString() );
  if( part != null ){
    LOG.info( part.toString() );
  }
  String columnNameProperty = table.getProperty(serdeConstants.LIST_COLUMNS);
  String columnTypeProperty = table.getProperty(serdeConstants.LIST_COLUMN_TYPES);

  String projectionColumnNames = conf.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , "" );

  StructTypeInfo rootType;
  if( projectionColumnNames.isEmpty() ){
    rootType = getAllReadTypeInfo( columnNameProperty , columnTypeProperty );
  }
  else{
    rootType = getColumnProjectionTypeInfo( columnNameProperty , columnTypeProperty , projectionColumnNames );
  }

  inspector = MDSObjectInspectorFactory.craeteObjectInspectorFromTypeInfo( rootType );
}
 
Example #3
Source File: HoodieParquetRealtimeInputFormat.java    From hudi with Apache License 2.0
/**
 * Add a field to the existing fields projected.
 */
private static Configuration addProjectionField(Configuration conf, String fieldName, int fieldIndex) {
  String readColNames = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "");
  String readColIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "");

  String readColNamesPrefix = readColNames + ",";
  if (readColNames == null || readColNames.isEmpty()) {
    readColNamesPrefix = "";
  }
  String readColIdsPrefix = readColIds + ",";
  if (readColIds == null || readColIds.isEmpty()) {
    readColIdsPrefix = "";
  }

  if (!readColNames.contains(fieldName)) {
    // If not already in the list - then add it
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, readColNamesPrefix + fieldName);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIdsPrefix + fieldIndex);
    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format("Adding extra column " + fieldName + ", to enable log merging cols (%s) ids (%s) ",
          conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
          conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)));
    }
  }
  return conf;
}
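
As a follow-up to Example #3, the snippet below re-implements the same append logic standalone so its effect on the two conf keys is easy to see; the pre-existing projection ("name,timestamp" with ids "5,6") and the extra field are hypothetical values, not taken from the Hudi sources.

import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public class AddProjectionFieldSketch {
  public static void main(String[] args) {
    JobConf jobConf = new JobConf();
    // Hypothetical existing projection: two columns already selected.
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "name,timestamp");
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "5,6");

    // Same behaviour as addProjectionField(jobConf, "_hoodie_record_key", 2) in Example #3:
    // append the field name and id only if the name is not already projected.
    String fieldName = "_hoodie_record_key";
    int fieldIndex = 2;
    String names = jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "");
    String ids = jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "");
    if (!names.contains(fieldName)) {
      jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
          names.isEmpty() ? fieldName : names + "," + fieldName);
      jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
          ids.isEmpty() ? String.valueOf(fieldIndex) : ids + "," + fieldIndex);
    }

    // Prints: name,timestamp,_hoodie_record_key / 5,6,2
    System.out.println(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
        + " / " + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  }
}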
 
Example #4
Source File: HiveTableInputFormat.java    From flink with Apache License 2.0
private void addSchemaToConf(JobConf jobConf) {
	// set columns/types -- including partition cols
	List<String> typeStrs = Arrays.stream(fieldTypes)
			.map(t -> HiveTypeUtil.toHiveTypeInfo(t, true).toString())
			.collect(Collectors.toList());
	jobConf.set(IOConstants.COLUMNS, String.join(",", fieldNames));
	jobConf.set(IOConstants.COLUMNS_TYPES, String.join(",", typeStrs));
	// set schema evolution -- excluding partition cols
	int numNonPartCol = fieldNames.length - partitionKeys.size();
	jobConf.set(SCHEMA_EVOLUTION_COLUMNS, String.join(",", Arrays.copyOfRange(fieldNames, 0, numNonPartCol)));
	jobConf.set(SCHEMA_EVOLUTION_COLUMNS_TYPES, String.join(",", typeStrs.subList(0, numNonPartCol)));

	// in older versions, parquet reader also expects the selected col indices in conf, excluding part cols
	String readColIDs = Arrays.stream(selectedFields)
			.filter(i -> i < numNonPartCol)
			.mapToObj(String::valueOf)
			.collect(Collectors.joining(","));
	jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, readColIDs);
}
 
Example #5
Source File: InputFormatTestUtil.java    From hudi with Apache License 2.0
public static void setPropsForInputFormat(JobConf jobConf,
    Schema schema, String hiveColumnTypes) {
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  Configuration conf = HoodieTestUtils.getDefaultHadoopConf();

  String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr"))
      .map(Schema.Field::name).collect(Collectors.joining(","));
  hiveColumnNames = hiveColumnNames + ",datestr";
  String modifiedHiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(hiveColumnTypes);
  modifiedHiveColumnTypes = modifiedHiveColumnTypes + ",string";
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, modifiedHiveColumnTypes);
  jobConf.addResource(conf);
}
 
Example #6
Source File: OrcInputFormat.java    From hive-dwrf with Apache License 2.0
/**
 * Take the configuration and figure out which columns we need to include.
 * @param types the types of the file
 * @param conf the configuration
 * @return true for each column that should be included
 */
private static boolean[] findIncludedColumns(List<OrcProto.Type> types,
                                             Configuration conf) {
  String includedStr =
      conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
  if (includedStr == null || includedStr.trim().length() == 0) {
    return null;
  } else {
    int numColumns = types.size();
    boolean[] result = new boolean[numColumns];
    result[0] = true;
    OrcProto.Type root = types.get(0);
    List<Integer> included = ColumnProjectionUtils.getReadColumnIDs(conf);
    for(int i=0; i < root.getSubtypesCount(); ++i) {
      if (included.contains(i)) {
        includeColumnRecursive(types, result, root.getSubtypes(i));
      }
    }
    // if we are filtering at least one column, return the boolean array
    for(boolean include: result) {
      if (!include) {
        return result;
      }
    }
    return null;
  }
}
 
Example #7
Source File: OrcStorage.java    From spork with Apache License 2.0
@Override
public void setLocation(String location, Job job) throws IOException {
    Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
    if (!UDFContext.getUDFContext().isFrontend()) {
        typeInfo = (TypeInfo)ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
    } else if (typeInfo == null) {
        typeInfo = getTypeInfo(location, job);
    }
    if (typeInfo != null && oi == null) {
        oi = OrcStruct.createObjectInspector(typeInfo);
    }
    if (!UDFContext.getUDFContext().isFrontend()) {
        if (p.getProperty(signature + RequiredColumnsSuffix) != null) {
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p
                    .getProperty(signature + RequiredColumnsSuffix));
            job.getConfiguration().setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
            job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
                    getReqiredColumnIdString(mRequiredColumns));
            if (p.getProperty(signature + SearchArgsSuffix) != null) {
                // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
                job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
                        getReqiredColumnNamesString(getSchema(location, job), mRequiredColumns));
            }
        } else if (p.getProperty(signature + SearchArgsSuffix) != null) {
            // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
            job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
                    getReqiredColumnNamesString(getSchema(location, job)));
        }
        if (p.getProperty(signature + SearchArgsSuffix) != null) {
            job.getConfiguration().set(SARG_PUSHDOWN, p.getProperty(signature + SearchArgsSuffix));
        }

    }
    FileInputFormat.setInputPaths(job, location);
}
 
Example #8
Source File: DataWritableReadSupport.java    From parquet-mr with Apache License 2.0
/**
 *
 * It creates the readContext for Parquet side with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns != null) {
    final List<String> listColumns = getColumns(columns);

    final List<Type> typeListTable = new ArrayList<Type>();
    for (final String col : listColumns) {
      // listColumns contains partition columns which are metadata only
      if (fileSchema.containsField(col)) {
        typeListTable.add(fileSchema.getType(col));
      } else {
        // below allows schema evolution
        typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
      }
    }
    MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    MessageType requestedSchemaByUser = tableSchema;
    final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

    final List<Type> typeListWanted = new ArrayList<Type>();
    for (final Integer idx : indexColumnsWanted) {
      typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
    }
    requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
            typeListWanted), fileSchema, configuration);

    return new ReadContext(requestedSchemaByUser, contextMetadata);
  } else {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
 
Example #9
Source File: IndexRRecordReader.java    From indexr with Apache License 2.0
private void getIncludeColumns(Configuration conf, Segment segment) {
    List<ColumnSchema> segColSchemas = segment.schema().getColumns();
    String columnNamesStr = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
    if (ColumnProjectionUtils.isReadAllColumns(conf) ||
            columnNamesStr == null) {
        projectCols = new ColumnSchema[segColSchemas.size()];
        projectColIds = new int[segColSchemas.size()];
        for (int i = 0; i < segColSchemas.size(); i++) {
            projectCols[i] = segColSchemas.get(i);
            projectColIds[i] = i;
        }
    } else {
        String[] ss = Strings.isEmpty(columnNamesStr.trim()) ? new String[]{} : columnNamesStr.split(",");
        projectCols = new ColumnSchema[ss.length];
        projectColIds = new int[ss.length];
        for (int i = 0; i < ss.length; i++) {
            String col = ss[i];
            int colId = Trick.indexFirst(segColSchemas, c -> c.getName().equalsIgnoreCase(col));
            //Preconditions.checkState(colId >= 0, String.format("Column [%s] not found in segment [%s]", col, segment.name()));
            if (colId < 0) {
                projectCols[i] = null;
                projectColIds[i] = -1;
            } else {
                projectCols[i] = segColSchemas.get(colId);
                projectColIds[i] = colId;
            }
        }
    }
}
 
Example #10
Source File: TestHoodieRealtimeRecordReader.java    From hudi with Apache License 2.0
private void setHiveColumnNameProps(List<Schema.Field> fields, JobConf jobConf, boolean isPartitioned) {
  String names = fields.stream().map(Field::name).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);

  String hiveOrderedColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase(PARTITION_COLUMN))
      .map(Field::name).collect(Collectors.joining(","));
  if (isPartitioned) {
    hiveOrderedColumnNames += "," + PARTITION_COLUMN;
    jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, PARTITION_COLUMN);
  }
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveOrderedColumnNames);
}
 
Example #11
Source File: AbstractRealtimeRecordReader.java    From hudi with Apache License 2.0
/**
 * Goes through the log files in reverse order and finds the schema from the last available data block. If none is
 * found, falls back to the schema from the latest parquet file. Finally, sets the partition columns and projection
 * fields into the job conf.
 */
private void init() throws IOException {
  Schema schemaFromLogFile =
      LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf);
  if (schemaFromLogFile == null) {
    writerSchema = HoodieRealtimeRecordReaderUtils.readSchema(jobConf, split.getPath());
    LOG.debug("Writer Schema From Parquet => " + writerSchema.getFields());
  } else {
    writerSchema = schemaFromLogFile;
    LOG.debug("Writer Schema From Log => " + writerSchema.getFields());
  }
  // Add partitioning fields to writer schema for resulting row to contain null values for these fields
  String partitionFields = jobConf.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
  List<String> partitioningFields =
      partitionFields.length() > 0 ? Arrays.stream(partitionFields.split("/")).collect(Collectors.toList())
          : new ArrayList<>();
  writerSchema = HoodieRealtimeRecordReaderUtils.addPartitionFields(writerSchema, partitioningFields);
  List<String> projectionFields = HoodieRealtimeRecordReaderUtils.orderFields(jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
      jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR), partitioningFields);

  Map<String, Field> schemaFieldsMap = HoodieRealtimeRecordReaderUtils.getNameToFieldMap(writerSchema);
  hiveSchema = constructHiveOrderedSchema(writerSchema, schemaFieldsMap);
  // TODO(vc): In the future, the reader schema should be updated based on log files & be able
  // to null out fields not present before

  readerSchema = HoodieRealtimeRecordReaderUtils.generateProjectionSchema(writerSchema, schemaFieldsMap, projectionFields);
  LOG.info(String.format("About to read compacted logs %s for base split %s, projecting cols %s",
      split.getDeltaLogPaths(), split.getPath(), projectionFields));
}
 
Example #12
Source File: AbstractRealtimeRecordReader.java    From hudi with Apache License 2.0
public AbstractRealtimeRecordReader(HoodieRealtimeFileSplit split, JobConf job) {
  this.split = split;
  this.jobConf = job;
  LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
  LOG.info("columnIds ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  LOG.info("partitioningColumns ==> " + job.get(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, ""));
  try {
    this.usesCustomPayload = usesCustomPayload();
    LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
    init();
  } catch (IOException e) {
    throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
  }
}
 
Example #13
Source File: HoodieParquetRealtimeInputFormat.java    From hudi with Apache License 2.0
@Override
public RecordReader<NullWritable, ArrayWritable> getRecordReader(final InputSplit split, final JobConf jobConf,
    final Reporter reporter) throws IOException {
  // Unlike Hive on MR, Hive on Spark invokes multiple getRecordReaders from different threads within the same
  // Spark task (and hence the same JVM). Due to this, accesses to the JobConf, which is shared across all threads,
  // are at risk of race conditions. Hence, we synchronize on the JobConf object here. The synchronization adds
  // negligible latency, since getRecordReader is called once per split, before the actual heavy lifting of
  // reading the parquet files happens.
  if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
    synchronized (jobConf) {
      LOG.info(
          "Before adding Hoodie columns, Projections :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
              + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
      if (jobConf.get(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP) == null) {
        // Hive (across all versions) fails for queries like select count(`_hoodie_commit_time`) from table;
        // in this case, the projection fields get removed. Looking at the HiveInputFormat implementation, in some
        // cases the additional hoodie projection columns are reset after calling setConf and only the natural
        // projections (the ones found in the select query) are set, which breaks things downstream.
        // For example, _hoodie_record_key would be missing and the merge step would throw exceptions.
        // To fix this, hoodie columns are appended late, at the time the record reader gets built, instead of at
        // construction time.
        cleanProjectionColumnIds(jobConf);
        addRequiredProjectionFields(jobConf);

        this.conf = jobConf;
        this.conf.set(HoodieInputFormatUtils.HOODIE_READ_COLUMNS_PROP, "true");
      }
    }
  }

  LOG.info("Creating record reader with readCols :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)
      + ", Ids :" + jobConf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  // sanity check
  ValidationUtils.checkArgument(split instanceof HoodieRealtimeFileSplit,
      "HoodieRealtimeRecordReader can only work on HoodieRealtimeFileSplit and not with " + split);

  return new HoodieRealtimeRecordReader((HoodieRealtimeFileSplit) split, jobConf,
      super.getRecordReader(split, jobConf, reporter));
}
 
Example #14
Source File: HoodieParquetRealtimeInputFormat.java    From hudi with Apache License 2.0
/**
 * Hive appends the read columns' ids to the old columns' ids during getRecordReader. In some cases, e.g. SELECT
 * COUNT(*), the read columns' id string is empty, so combining it with the Hoodie required projection ids produces
 * a value like ",2,0,3", which causes an error. This method is only a temporary workaround, because the real bug is
 * in Hive: it was fixed after 3.0.0 (HIVE-22438), but earlier versions still face this problem.
 */
private static void cleanProjectionColumnIds(Configuration conf) {
  String columnIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
  if (!columnIds.isEmpty() && columnIds.charAt(0) == ',') {
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, columnIds.substring(1));
    if (LOG.isDebugEnabled()) {
      LOG.debug("The projection Ids: {" + columnIds + "} start with ','. First comma is removed");
    }
  }
}
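
To make the leading-comma case from the javadoc concrete, here is a minimal sketch (reusing the ",2,0,3" value quoted above; this is illustrative code, not part of Hudi) showing the trimming that cleanProjectionColumnIds performs.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;

public class CleanProjectionIdsSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // An empty user projection combined with the required projection ids, as Hive
    // versions before the HIVE-22438 fix can produce for SELECT COUNT(*) queries.
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, ",2,0,3");

    // The same trimming that cleanProjectionColumnIds applies in Example #14.
    String columnIds = conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
    if (!columnIds.isEmpty() && columnIds.charAt(0) == ',') {
      conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, columnIds.substring(1));
    }

    // Prints: 2,0,3
    System.out.println(conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR));
  }
}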
 
Example #15
Source File: HoodieMergeOnReadTestUtils.java    From hudi with Apache License 2.0
private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema,
                                           String basePath) {
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  Configuration conf = HoodieTestUtils.getDefaultHadoopConf();

  String hiveColumnNames = fields.stream().filter(field -> !field.name().equalsIgnoreCase("datestr"))
      .map(Schema.Field::name).collect(Collectors.joining(","));
  hiveColumnNames = hiveColumnNames + ",datestr";

  String hiveColumnTypes = HoodieAvroUtils.addMetadataColumnTypes(HoodieTestDataGenerator.TRIP_HIVE_COLUMN_TYPES);
  hiveColumnTypes = hiveColumnTypes + ",string";
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  jobConf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMNS, hiveColumnNames);
  conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
  conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);

  // Hoodie Input formats are also configurable
  Configurable configurable = (Configurable)inputFormat;
  configurable.setConf(conf);
  jobConf.addResource(conf);
}
 
Example #16
Source File: HdfsSerDeImportService.java    From hadoop-etl-udfs with MIT License
private static void initProperties(
        Properties props,
        Configuration conf,
        List<HCatTableColumn> columns,
        List<OutputColumnSpec> outputColumns) throws Exception {
    String colNames = "";
    String colTypes = "";
    for (HCatTableColumn colInfo : columns) {
        colNames += colInfo.getName() + ",";
        colTypes += colInfo.getDataType() + ",";
    }
    if (colNames.length() > 0)
        colNames = colNames.substring(0, colNames.length() - 1);
    if (colTypes.length() > 0)
        colTypes = colTypes.substring(0, colTypes.length() - 1);
    props.put(serdeConstants.LIST_COLUMNS, colNames);
    props.put(serdeConstants.LIST_COLUMN_TYPES, colTypes);
    props.put(serdeConstants.SERIALIZATION_NULL_FORMAT, "NULL");
    // Fix for Avro (NullPointerException if null)
    if (props.getProperty("columns.comments") == null) {
        props.put("columns.comments", "");
    }
    // Pushdown projection if we don't need all columns
    Set<Integer> requiredColumns = new HashSet<>();
    for (OutputColumnSpec spec : outputColumns) {
        if (spec.getColumnPosition() < columns.size()) {
            requiredColumns.add(spec.getColumnPosition());
        }
    }
    if (requiredColumns.size() < columns.size()) {
        ColumnProjectionUtils.appendReadColumns(conf, new ArrayList<>(requiredColumns));
    }
}
 
Example #17
Source File: HiveCassandraStandardColumnInputFormat.java    From Hive-Cassandra with Apache License 2.0
@Override
public RecordReader<BytesWritable, MapWritable> getRecordReader(InputSplit split,
    JobConf jobConf, final Reporter reporter) throws IOException {
  HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

  List<String> columns = AbstractColumnSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
  isTransposed = AbstractColumnSerDe.isTransposed(columns);


  List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

  if (columns.size() < readColIDs.size()) {
    throw new IOException("Cannot read more columns than the given table contains.");
  }

  org.apache.cassandra.hadoop.ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
  Job job = new Job(jobConf);

  TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
    @Override
    public void progress() {
      reporter.progress();
    }
  };

  SlicePredicate predicate = new SlicePredicate();

  if (isTransposed || readColIDs.size() == columns.size() || readColIDs.size() == 0) {
    SliceRange range = new SliceRange();
    AbstractType comparator = BytesType.instance;

    String comparatorType = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_COMPARATOR);
    if (comparatorType != null && !comparatorType.equals("")) {
      try {
        comparator = TypeParser.parse(comparatorType);
      } catch (Exception ex) {
        throw new IOException("Comparator class not found.");
      }
    }

    String sliceStart = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_START);
    String sliceEnd = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_FINISH);
    String reversed = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_REVERSED);

    range.setStart(comparator.fromString(sliceStart == null ? "" : sliceStart));
    range.setFinish(comparator.fromString(sliceEnd == null ? "" : sliceEnd));
    range.setReversed(reversed == null ? false : reversed.equals("true"));
    range.setCount(cassandraSplit.getSlicePredicateSize());
    predicate.setSlice_range(range);
  } else {
    int iKey = columns.indexOf(AbstractColumnSerDe.CASSANDRA_KEY_COLUMN);
    predicate.setColumn_names(getColumnNames(iKey, columns, readColIDs));
  }


  try {
    ConfigHelper.setInputColumnFamily(tac.getConfiguration(),
        cassandraSplit.getKeyspace(), cassandraSplit.getColumnFamily());

    ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
    ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
    ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
    ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
    ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
    // Set Split Size
    ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

    CassandraHiveRecordReader rr = null;

    if(isTransposed && tac.getConfiguration().getBoolean(AbstractColumnSerDe.CASSANDRA_ENABLE_WIDEROW_ITERATOR, true)) {
      rr = new CassandraHiveRecordReader(new ColumnFamilyWideRowRecordReader(), isTransposed);
    } else {
      rr = new CassandraHiveRecordReader(new ColumnFamilyRecordReader(), isTransposed);
    }
    rr.initialize(cfSplit, tac);

    return rr;

  } catch (Exception ie) {
    throw new IOException(ie);
  }
}
 
Example #18
Source File: HiveReaderSetting.java    From multiple-dimension-spread with Apache License 2.0
public HiveReaderSetting( final FileSplit split, final JobConf job ){
  config = new Configuration();

  disableSkipBlock = job.getBoolean( "mds.disable.block.skip" , false );
  disableFilterPushdown = job.getBoolean( "mds.disable.filter.pushdown" , false );

  Set<String> pathNameSet= createPathSet( split.getPath() );
  List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
  String filterExprSerialized = job.get( TableScanDesc.FILTER_EXPR_CONF_STR );
  if( filterExprSerialized != null ){
    filterExprs.add( Utilities.deserializeExpression(filterExprSerialized) );
  }

  MapWork mapWork;
  try{
    mapWork = Utilities.getMapWork(job);
  }catch( Exception e ){
    mapWork = null;
  }

  if( mapWork == null ){
    node = createExpressionNode( filterExprs );
    isVectorModeFlag = false;
    return;
  }

  node = createExpressionNode( filterExprs );

  for( Map.Entry<String,PartitionDesc> pathsAndParts: mapWork.getPathToPartitionInfo().entrySet() ){
    if( ! pathNameSet.contains( pathsAndParts.getKey() ) ){
      continue;
    }
    Properties props = pathsAndParts.getValue().getTableDesc().getProperties();
    if( props.containsKey( "mds.expand" ) ){
      config.set( "spread.reader.expand.column" , props.getProperty( "mds.expand" ) );
    }
    if( props.containsKey( "mds.flatten" ) ){
      config.set( "spread.reader.flatten.column" , props.getProperty( "mds.flatten" ) );
    }
  }

  config.set( "spread.reader.read.column.names" , createReadColumnNames( job.get( ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR , null ) ) );

  // Next Hive version:
  // Utilities.getUseVectorizedInputFileFormat(job)
  isVectorModeFlag = Utilities.isVectorMode( job );
}