org.apache.hive.hcatalog.data.HCatRecord Java Examples

The following examples show how to use org.apache.hive.hcatalog.data.HCatRecord. Each example notes its source file, the project it comes from, and that project's license; follow those references to the original project or source file for full context.
Example #1
Source File: FastIndexMapper.java    From ES-Fastloader with Apache License 2.0
@Override
protected void map(Object key, HCatRecord value, Context context) throws IOException, InterruptedException {
    DefaultHCatRecord hCatRecord = (DefaultHCatRecord) value;
    int shardNo;

    List<String> keyList = taskConfig.getKeyList();
    if (keyList == null || keyList.size() == 0) {
        shardNo = (int) (Math.random() * templateConfig.getReducerNum());
    } else {
        String keyStr = getKeyValue(keyList, hCatRecord);
        shardNo = CommonUtils.getShardId(keyStr, templateConfig.getReducerNum());
    }

    // the number of shards matches the number of reducers
    context.write(new IntWritable(shardNo), hCatRecord);
}
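
CommonUtils.getShardId is not shown above. A minimal sketch of such a key-based shard router, assuming it simply hashes the key string modulo the shard count (the actual ES-Fastloader implementation may differ):

// Hypothetical stand-in for CommonUtils.getShardId: hash the key modulo shardCount.
// Masking the sign bit keeps the result non-negative even for negative hash codes.
static int getShardId(String key, int shardCount) {
    return (key.hashCode() & Integer.MAX_VALUE) % shardCount;
}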
 
Example #2
Source File: HCatalogIO.java    From beam with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public PCollection<HCatRecord> expand(PBegin input) {
  checkArgument(getTable() != null, "withTable() is required");
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  Watch.Growth<Read, Integer, Integer> growthFn;
  if (getPollingInterval() != null) {
    growthFn = Watch.growthOf(new PartitionPollerFn()).withPollInterval(getPollingInterval());
    if (getTerminationCondition() != null) {
      growthFn = growthFn.withTerminationPerInput(getTerminationCondition());
    }
    return input
        .apply("ConvertToReadRequest", Create.of(this))
        .apply("WatchForNewPartitions", growthFn)
        .apply("PartitionReader", ParDo.of(new PartitionReaderFn(getConfigProperties())));
  } else {
    // Treat as Bounded
    checkArgument(
        getTerminationCondition() == null,
        "withTerminationCondition() is not required when using in bounded reads mode");
    return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedHCatalogSource(this)));
  }
}
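
For context, a hedged usage sketch of this transform: supplying a polling interval selects the unbounded "watch for new partitions" branch above, while omitting it yields a plain bounded read. The metastore URI and table name below are placeholders.

// Assumes: a Pipeline named pipeline, org.joda.time.Duration, and a reachable Hive metastore.
Map<String, String> configProps = new HashMap<>();
configProps.put("hive.metastore.uris", "thrift://metastore-host:9083"); // placeholder URI
PCollection<HCatRecord> records =
    pipeline.apply(
        HCatalogIO.read()
            .withConfigProperties(configProps)
            .withTable("my_table") // placeholder table
            .withPollingInterval(Duration.standardMinutes(15))); // selects unbounded mode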
 
Example #3
Source File: IIDistinctColumnsMapper.java    From Kylin with Apache License 2.0
@Override
public void map(KEYIN key, HCatRecord record, Context context) throws IOException, InterruptedException {

    HCatFieldSchema fieldSchema = null;
    for (short i = 0; i < columnSize; i++) {
        outputKey.set(i);
        fieldSchema = schema.get(i);
        Object fieldValue = record.get(fieldSchema.getName(), schema);
        if (fieldValue == null)
            continue;
        byte[] bytes = Bytes.toBytes(fieldValue.toString());
        outputValue.set(bytes, 0, bytes.length);
        context.write(outputKey, outputValue);
    }

}
 
Example #4
Source File: ColumnCardinalityMapper.java    From Kylin with Apache License 2.0
@Override
public void map(T key, HCatRecord value, Context context) throws IOException, InterruptedException {

    HCatFieldSchema field;
    Object fieldValue;
    for (int m = 0; m < columnSize; m++) {
        field = schema.get(m);
        fieldValue = value.get(field.getName(), schema);
        if (fieldValue == null)
            fieldValue = "NULL";
        
        if (counter < 5 && m < 10) {
            System.out.println("Get row " + counter + " column '" + field.getName() + "'  value: " + fieldValue);
        }

        if (fieldValue != null)
            getHllc(m).add(Bytes.toBytes(fieldValue.toString()));
    }

    counter++;
}
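
getHllc(m) is not shown; it lazily creates one HyperLogLog counter per column so that per-column cardinalities can be estimated in a single pass over the data. A plausible sketch of that accessor, assuming a Kylin-style HyperLogLogPlusCounter class (the real Kylin code may differ):

// Hypothetical lazy per-column counter accessor; class and field names are assumptions.
private final Map<Integer, HyperLogLogPlusCounter> hllcMap = new HashMap<>();

private HyperLogLogPlusCounter getHllc(Integer index) {
    // Create the column's counter on first access, then reuse it for every row.
    return hllcMap.computeIfAbsent(index, k -> new HyperLogLogPlusCounter());
}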
 
Example #5
Source File: HCatalogIOTest.java    From beam with Apache License 2.0
/** Test of Read using SourceTestUtils.assertSourcesEqualReferenceSource(..). */
@Test
@NeedsTestData
public void testSourceEqualsSplits() throws Exception {
  final int numRows = 1500;
  final int numSamples = 10;
  final long bytesPerRow = 15;
  ReaderContext context = getReaderContext(getConfigPropertiesAsMap(service.getHiveConf()));
  HCatalogIO.Read spec =
      HCatalogIO.read()
          .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
          .withContext(context)
          .withTable(TEST_TABLE);

  BoundedHCatalogSource source = new BoundedHCatalogSource(spec);
  List<BoundedSource<HCatRecord>> unSplitSource = source.split(-1, OPTIONS);
  assertEquals(1, unSplitSource.size());

  List<BoundedSource<HCatRecord>> splits =
      source.split(numRows * bytesPerRow / numSamples, OPTIONS);
  assertTrue(splits.size() >= 1);

  SourceTestUtils.assertSourcesEqualReferenceSource(unSplitSource.get(0), splits, OPTIONS);
}
 
Example #6
Source File: TableDataBuilderTest.java    From HiveRunner with Apache License 2.0
@Test
public void testPartitionedSimple() {
  HCatTable table = table().cols(columns(COLUMN_1)).partCols(columns(PARTITION_COLUMN_1));

  Multimap<Map<String, String>, HCatRecord> data = new TableDataBuilder(table)
      .addRow("value", "partition_value")
      .build();

  assertEquals(1, data.size());

  Map<String, String> partitionSpec = new HashMap<>();
  partitionSpec.put(PARTITION_COLUMN_1, "partition_value");

  Collection<HCatRecord> rows = data.get(partitionSpec);
  assertEquals(1, rows.size());
  HCatRecord row = rows.iterator().next();
  assertEquals(Arrays.asList((Object) "value", "partition_value"), row.getAll());
}
 
Example #7
Source File: HCatalogIOTest.java    From beam with Apache License 2.0
/** Test of Read using SourceTestUtils.readFromSource(..). */
@Test
@NeedsTestData
public void testReadFromSource() throws Exception {
  ReaderContext context = getReaderContext(getConfigPropertiesAsMap(service.getHiveConf()));
  HCatalogIO.Read spec =
      HCatalogIO.read()
          .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
          .withContext(context)
          .withTable(TEST_TABLE);

  List<String> records = new ArrayList<>();
  for (int i = 0; i < context.numSplits(); i++) {
    BoundedHCatalogSource source = new BoundedHCatalogSource(spec.withSplitId(i));
    for (HCatRecord record : SourceTestUtils.readFromSource(source, OPTIONS)) {
      records.add(record.get(0).toString());
    }
  }
  assertThat(records, containsInAnyOrder(getExpectedRecords(TEST_RECORDS_COUNT).toArray()));
}
 
Example #8
Source File: FactDistinctColumnsMapper.java    From Kylin with Apache License 2.0
@Override
public void map(KEYIN key, HCatRecord record, Context context) throws IOException, InterruptedException {

    try {

        int[] flatTableIndexes = intermediateTableDesc.getRowKeyColumnIndexes();
        HCatFieldSchema fieldSchema = null;
        for (int i : factDictCols) {
            outputKey.set((short) i);
            fieldSchema = schema.get(flatTableIndexes[i]);
            Object fieldValue = record.get(fieldSchema.getName(), schema);
            if (fieldValue == null)
                continue;
            byte[] bytes = Bytes.toBytes(fieldValue.toString());
            outputValue.set(bytes, 0, bytes.length);
            context.write(outputKey, outputValue);
        }
    } catch (Exception ex) {
        handleErrorRecord(record, ex);
    }

}
 
Example #9
Source File: HCatalogIO.java    From beam with Apache License 2.0
/**
 * Calculates the 'desired' number of splits based on desiredBundleSizeBytes, which is passed as
 * a hint to the native API. Retrieves the actual splits generated by the native API, which could
 * be different from the 'desired' split count calculated using desiredBundleSizeBytes.
 */
@Override
public List<BoundedSource<HCatRecord>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  int desiredSplitCount = 1;
  long estimatedSizeBytes = getEstimatedSizeBytes(options);
  if (desiredBundleSizeBytes > 0 && estimatedSizeBytes > 0) {
    desiredSplitCount = (int) Math.ceil((double) estimatedSizeBytes / desiredBundleSizeBytes);
  }
  ReaderContext readerContext = getReaderContext(desiredSplitCount);
  // process the splits returned by native API
  // this could be different from 'desiredSplitCount' calculated above
  LOG.info(
      "Splitting into bundles of {} bytes: "
          + "estimated size {}, desired split count {}, actual split count {}",
      desiredBundleSizeBytes,
      estimatedSizeBytes,
      desiredSplitCount,
      readerContext.numSplits());

  List<BoundedSource<HCatRecord>> res = new ArrayList<>();
  for (int split = 0; split < readerContext.numSplits(); split++) {
    res.add(new BoundedHCatalogSource(spec.withContext(readerContext).withSplitId(split)));
  }
  return res;
}
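
A worked example of the split-count arithmetic above, using the illustrative numbers from testSourceEqualsSplits (1500 rows at 15 bytes per row, bundles sized for 10 samples):

// 1500 rows * 15 bytes/row = 22,500 estimated bytes; desired bundle size 2,250 bytes.
long estimatedSizeBytes = 1500L * 15;                    // 22,500
long desiredBundleSizeBytes = estimatedSizeBytes / 10;   // 2,250
int desiredSplitCount =
    (int) Math.ceil((double) estimatedSizeBytes / desiredBundleSizeBytes); // = 10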
 
Example #10
Source File: HCatalogTestUtils.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private List<HCatRecord> generateHCatRecords(int numRecords,
  HCatSchema hCatTblSchema, ColumnGenerator... extraCols) throws Exception {
  List<HCatRecord> records = new ArrayList<HCatRecord>();
  List<HCatFieldSchema> hCatTblCols = hCatTblSchema.getFields();
  int size = hCatTblCols.size();
  for (int i = 0; i < numRecords; ++i) {
    DefaultHCatRecord record = new DefaultHCatRecord(size);
    record.set(hCatTblCols.get(0).getName(), hCatTblSchema, i);
    record.set(hCatTblCols.get(1).getName(), hCatTblSchema, "textfield" + i);
    int idx = 0;
    for (int j = 0; j < extraCols.length; ++j) {
      if (extraCols[j].getKeyType() == KeyType.STATIC_KEY) {
        continue;
      }
      record.set(hCatTblCols.get(idx + 2).getName(), hCatTblSchema,
        extraCols[j].getHCatValue(i));
      ++idx;
    }

    records.add(record);
  }
  return records;
}
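
HCatRecord supports both positional and schema-based access, and the generator above mixes the two. A minimal sketch contrasting them; the column names and the hCatSchema variable are assumptions, and the name-based calls throw the checked HCatException:

// Assumes hCatSchema describes two columns: an int "id" and a string "msg".
DefaultHCatRecord rec = new DefaultHCatRecord(2);
rec.set(0, 42);                            // by position
rec.set("msg", hCatSchema, "hello");       // by name, resolved through the schema
Object id = rec.get(0);
Object msg = rec.get("msg", hCatSchema);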
 
Example #11
Source File: TableDataInserter.java    From HiveRunner with Apache License 2.0
private void insert(Map<String, String> partitionSpec, Iterable<HCatRecord> rows) {
  WriteEntity entity = new WriteEntity.Builder()
      .withDatabase(databaseName)
      .withTable(tableName)
      .withPartition(partitionSpec)
      .build();

  try {
    HCatWriter master = DataTransferFactory.getHCatWriter(entity, config);
    WriterContext context = master.prepareWrite();
    HCatWriter writer = DataTransferFactory.getHCatWriter(context);
    writer.write(rows.iterator());
    master.commit(context);
  } catch (HCatException e) {
    throw new RuntimeException("An error occurred while inserting data to " + databaseName + "." + tableName, e);
  }
}
 
Example #12
Source File: InvertedIndexMapper.java    From Kylin with Apache License 2.0
@Override
public void map(KEYIN key, HCatRecord record, Context context) throws IOException, InterruptedException {

    rec.reset();
    for (int i = 0; i < fields.size(); i++) {
        Object fieldValue = record.get(i);
        rec.setValueString(i, fieldValue == null ? null : fieldValue.toString());
    }

    outputKey.set(rec.getTimestamp());
    // outputValue's backing bytes array is the same as rec

    context.write(outputKey, outputValue);
}
 
Example #13
Source File: TableDataBuilderTest.java    From HiveRunner with Apache License 2.0
@Test
public void testUnpartitionedEmptyRow() {
  HCatTable table = table().cols(columns(COLUMN_1));

  Multimap<Map<String, String>, HCatRecord> data = new TableDataBuilder(table).newRow().build();

  assertEquals(1, data.size());
  Iterator<HCatRecord> iterator = data.values().iterator();
  HCatRecord row = iterator.next();
  assertEquals(Arrays.asList((Object) null), row.getAll());
}
 
Example #14
Source File: PartitionReaderFn.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  final Read readRequest = c.element().getKey();
  final Integer partitionIndexToRead = c.element().getValue();
  ReaderContext readerContext = getReaderContext(readRequest, partitionIndexToRead);
  for (int i = 0; i < readerContext.numSplits(); i++) {
    HCatReader reader = DataTransferFactory.getHCatReader(readerContext, i);
    Iterator<HCatRecord> hcatIterator = reader.read();
    while (hcatIterator.hasNext()) {
      final HCatRecord record = hcatIterator.next();
      c.output(record);
    }
  }
}
 
Example #15
Source File: HCatalogTestUtils.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public String hCatRecordDump(List<HCatRecord> recs,
  HCatSchema schema) throws Exception {
  List<String> fields = schema.getFieldNames();
  int count = 0;
  StringBuilder sb = new StringBuilder(1024);
  for (HCatRecord rec : recs) {
    sb.append("HCat Record : " + ++count).append('\n');
    for (String field : fields) {
      sb.append('\t').append(field).append('=');
      sb.append(rec.get(field, schema)).append('\n');
      sb.append("\n\n");
    }
  }
  return sb.toString();
}
 
Example #16
Source File: NetezzaExternalTableHCatExportMapper.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
public void map(LongWritable key, HCatRecord hcr, Context context)
  throws IOException, InterruptedException {
  SqoopRecord sqr = helper.convertToSqoopRecord(hcr);
  writeSqoopRecord(sqr);
  context.progress();
}
 
Example #17
Source File: HiveTableReader.java    From kylin-on-parquet-v2 with Apache License 2.0
public static String[] getRowAsStringArray(HCatRecord record) {
    String[] arr = new String[record.size()];
    for (int i = 0; i < arr.length; i++) {
        Object o = record.get(i);
        arr[i] = (o == null || "\\N".equals(o)) ? null : o.toString();
    }
    return arr;
}
 
Example #18
Source File: TableDataBuilderTest.java    From HiveRunner with Apache License 2.0
@Test
public void testUnpartitionedWithColumnMask() {
  HCatTable table = table().cols(columns(COLUMN_1));

  Multimap<Map<String, String>, HCatRecord> data = new TableDataBuilder(table)
      .withColumns(COLUMN_1)
      .addRow("value")
      .build();

  assertEquals(1, data.size());
  Iterator<HCatRecord> iterator = data.values().iterator();
  HCatRecord row = iterator.next();
  assertEquals(Arrays.asList((Object) "value"), row.getAll());
}
 
Example #19
Source File: TableDataBuilderTest.java    From HiveRunner with Apache License 2.0
@Test
public void testPartitionedMultiplePartitionsAndRows() {
  HCatTable table = table().cols(columns(COLUMN_1)).partCols(columns(PARTITION_COLUMN_1));

  Multimap<Map<String, String>, HCatRecord> data = new TableDataBuilder(table)
      .addRow("value1", "partition_value1")
      .addRow("value2", "partition_value1")
      .addRow("value3", "partition_value2")
      .addRow("value4", "partition_value2")
      .build();

  assertEquals(4, data.size());

  Map<String, String> partitionSpec = new HashMap<>();
  partitionSpec.put(PARTITION_COLUMN_1, "partition_value1");

  Collection<HCatRecord> rows = data.get(partitionSpec);
  assertEquals(2, rows.size());
  Iterator<HCatRecord> iterator = rows.iterator();
  HCatRecord row = iterator.next();
  assertEquals(Arrays.asList((Object) "value1", "partition_value1"), row.getAll());
  row = iterator.next();
  assertEquals(Arrays.asList((Object) "value2", "partition_value1"), row.getAll());

  partitionSpec = new HashMap<>();
  partitionSpec.put(PARTITION_COLUMN_1, "partition_value2");

  rows = data.get(partitionSpec);
  assertEquals(2, rows.size());
  iterator = rows.iterator();
  row = iterator.next();
  assertEquals(Arrays.asList((Object) "value3", "partition_value2"), row.getAll());
  row = iterator.next();
  assertEquals(Arrays.asList((Object) "value4", "partition_value2"), row.getAll());
}
 
Example #20
Source File: TableDataBuilder.java    From HiveRunner with Apache License 2.0
TableDataBuilder copyRow() {
  checkState(row != null, "No previous row to copy.");
  HCatRecord copy = new DefaultHCatRecord(new ArrayList<>(row.getAll()));
  flushRow();
  row = copy;
  return this;
}
 
Example #21
Source File: HCatalogIOTestUtils.java    From beam with Apache License 2.0
/** Returns a list of HCatRecords of the passed size. */
public static List<HCatRecord> buildHCatRecords(int size) {
  List<HCatRecord> expected = new ArrayList<>();
  for (int i = 0; i < size; i++) {
    expected.add(toHCatRecord(i));
  }
  return expected;
}
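
toHCatRecord(i) is elided here. A plausible reconstruction, assuming each record carries a string field derived from the index plus the index itself (hypothetical; the actual Beam test helper may differ):

// Hypothetical reconstruction of toHCatRecord; field order and values are assumptions.
private static HCatRecord toHCatRecord(int i) {
    return new DefaultHCatRecord(Arrays.asList((Object) ("record " + i), i));
}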
 
Example #22
Source File: FactDistinctColumnsMapper.java    From Kylin with Apache License 2.0
private void handleErrorRecord(HCatRecord record, Exception ex) throws IOException {

        System.err.println("Insane record: " + record.getAll());
        ex.printStackTrace(System.err);

        errorRecordCounter++;
        if (errorRecordCounter > BatchConstants.ERROR_RECORD_THRESHOLD) {
            if (ex instanceof IOException)
                throw (IOException) ex;
            else if (ex instanceof RuntimeException)
                throw (RuntimeException) ex;
            else
                throw new RuntimeException("", ex);
        }
    }
 
Example #23
Source File: HCatInputFormatBase.java    From Flink-CEPplus with Apache License 2.0
@Override
public T nextRecord(T record) throws IOException {
	if (!this.fetched) {
		// first record
		fetchNext();
	}
	if (!this.hasNext) {
		return null;
	}
	try {

		// get next HCatRecord
		HCatRecord v = this.recordReader.getCurrentValue();
		this.fetched = false;

		if (this.fieldNames.length > 0) {
			// return as Flink tuple
			return this.buildFlinkTuple(record, v);

		} else {
			// return as HCatRecord
			return (T) v;
		}

	} catch (InterruptedException e) {
		throw new IOException("Could not get next record.", e);
	}
}
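
fetchNext() is not shown above. The fetched/hasNext flags exist so that end-of-input checks and nextRecord() can each be called without consuming an extra record; a sketch consistent with that contract (hypothetical reconstruction of the Flink helper):

private void fetchNext() throws IOException {
	try {
		// Advance the wrapped Hadoop RecordReader once and cache whether a value is available.
		this.hasNext = this.recordReader.nextKeyValue();
	} catch (InterruptedException e) {
		throw new IOException("Could not fetch next KeyValue pair.", e);
	} finally {
		this.fetched = true;
	}
}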
 
Example #24
Source File: HCatalogIO.java    From beam with Apache License 2.0
@Override
public HCatRecord getCurrent() {
  if (current == null) {
    throw new NoSuchElementException("Current element is null");
  }
  return current;
}
 
Example #25
Source File: HCatalogIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<HCatRecord> input) {
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  checkArgument(getTable() != null, "withTable() is required");
  input.apply(ParDo.of(new WriteFn(this)));
  return PDone.in(input.getPipeline());
}
 
Example #26
Source File: HCatalogIOTest.java    From beam with Apache License 2.0
/** Perform end-to-end test of Write-then-Read operation. */
@Test
@NeedsEmptyTestTables
public void testWriteThenReadSuccess() {
  defaultPipeline
      .apply(Create.of(buildHCatRecords(TEST_RECORDS_COUNT)))
      .apply(
          HCatalogIO.write()
              .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
              .withDatabase(TEST_DATABASE)
              .withTable(TEST_TABLE)
              .withPartition(new java.util.HashMap<>())
              .withBatchSize(512L));
  defaultPipeline.run();

  PCollection<String> output =
      readAfterWritePipeline
          .apply(
              HCatalogIO.read()
                  .withConfigProperties(getConfigPropertiesAsMap(service.getHiveConf()))
                  .withDatabase(TEST_DATABASE)
                  .withTable(TEST_TABLE)
                  .withFilter(TEST_FILTER))
          .apply(
              ParDo.of(
                  new DoFn<HCatRecord, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      c.output(c.element().get(0).toString());
                    }
                  }));
  PAssert.that(output).containsInAnyOrder(getExpectedRecords(TEST_RECORDS_COUNT));
  readAfterWritePipeline.run();
}
 
Example #27
Source File: HiveTableReader.java    From kylin with Apache License 2.0
public static List<String> getRowAsList(HCatRecord record, List<String> rowValues) {
    List<Object> allFields = record.getAll();
    for (Object o : allFields) {
        rowValues.add((o == null) ? null : o.toString());
    }
    return rowValues;
}
 
Example #28
Source File: HiveTableReader.java    From kylin with Apache License 2.0
public static String[] getRowAsStringArray(HCatRecord record) {
    String[] arr = new String[record.size()];
    for (int i = 0; i < arr.length; i++) {
        Object o = record.get(i);
        arr[i] = (o == null || "\\N".equals(o)) ? null : o.toString();
    }
    return arr;
}
 
Example #29
Source File: HCatInputFormatBase.java    From flink with Apache License 2.0
@Override
public T nextRecord(T record) throws IOException {
	if (!this.fetched) {
		// first record
		fetchNext();
	}
	if (!this.hasNext) {
		return null;
	}
	try {

		// get next HCatRecord
		HCatRecord v = this.recordReader.getCurrentValue();
		this.fetched = false;

		if (this.fieldNames.length > 0) {
			// return as Flink tuple
			return this.buildFlinkTuple(record, v);

		} else {
			// return as HCatRecord
			return (T) v;
		}

	} catch (InterruptedException e) {
		throw new IOException("Could not get next record.", e);
	}
}
 
Example #30
Source File: FactDistinctColumnsMapperTest.java    From kylin with Apache License 2.0
@Test
public void testMapper() throws IOException {
    Configuration configuration = mapDriver.getConfiguration();
    configuration.set(BatchConstants.CFG_STATISTICS_SAMPLING_PERCENT, "100");
    configuration.set(BatchConstants.CFG_CUBE_NAME, "test_kylin_cube_with_slr_1_new_segment");
    configuration.set(BatchConstants.CFG_CUBE_SEGMENT_ID, "198va32a-a33e-4b69-83dd-0bb8b1f8c53b");
    HCatRecord value1 = new DefaultHCatRecord(11);
    value1.set(0, "2012-08-16");
    value1.set(1, "48027");
    value1.set(2, "0");
    value1.set(3, "Home & Garden");
    value1.set(4, "Cheese & Crackers");
    value1.set(5, "Cheese & Crackers");
    value1.set(6, "48027");
    value1.set(7, "16");
    value1.set(8, "10000010");
    value1.set(9, "204.28");
    value1.set(10, "5");
    mapDriver.addInput(new LongWritable(0), value1);

    List<Pair<SelfDefineSortableKey, Text>> result = mapDriver.run();
    int colsNeedDictSize = cubeDesc.getAllColumnsNeedDictionaryBuilt().size();
    int cuboidsCnt = cubeDesc.getAllCuboids().size();

    assertEquals(
            colsNeedDictSize + (cubeDesc.getRowkey().getRowKeyColumns().length - colsNeedDictSize) * 2 + cuboidsCnt,
            result.size());
}