Java Code Examples for org.apache.hadoop.mapred.RecordReader#createKey()

The following examples show how to use org.apache.hadoop.mapred.RecordReader#createKey(). Each snippet is taken from an open-source project; the source file, originating project, and license are noted above each example.
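Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the usual createKey()/createValue()/next() loop with the old-style mapred API. TextInputFormat, LongWritable, and Text are standard Hadoop classes; the class name and the input path read from the command line are placeholders.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class RecordReaderCreateKeySketch {

  public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(RecordReaderCreateKeySketch.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));

    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(conf);

    long lineCount = 0;
    for (InputSplit split : inputFormat.getSplits(conf, 1)) {
      RecordReader<LongWritable, Text> reader =
          inputFormat.getRecordReader(split, conf, Reporter.NULL);
      // createKey()/createValue() hand back reusable instances of the reader's
      // key and value types; next() fills them in place on every call.
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          ++lineCount;
        }
      } finally {
        reader.close();
      }
    }
    System.out.println("Read " + lineCount + " lines");
  }
}

Because createKey() and createValue() return reusable objects that next() overwrites in place, code that needs to retain a record must allocate fresh instances, which is exactly what the InputSampler and MultithreadedMapRunner examples below do.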
Example 1
Source File: TestHoodieParquetInputFormat.java    From hudi with Apache License 2.0
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
    int totalExpected) throws IOException {
  int actualCount = 0;
  int totalCount = 0;
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
  for (InputSplit split : splits) {
    RecordReader<NullWritable, ArrayWritable> recordReader = inputFormat.getRecordReader(split, jobConf, null);
    NullWritable key = recordReader.createKey();
    ArrayWritable writable = recordReader.createValue();

    while (recordReader.next(key, writable)) {
      // writable returns an array with [field1, field2, _hoodie_commit_time,
      // _hoodie_commit_seqno]
      // Take the commit time and compare with the one we are interested in
      if (commit.equals((writable.get()[2]).toString())) {
        actualCount++;
      }
      totalCount++;
    }
  }
  assertEquals(expectedNumberOfRecordsInCommit, actualCount, msg);
  assertEquals(totalExpected, totalCount, msg);
}
 
Example 2
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 3
Source File: InputSampler.java    From big-c with Apache License 2.0
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      ++records;
      if ((double) kept / records < freq) {
        ++kept;
        samples.add(key);
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 4
Source File: InputSampler.java    From big-c with Apache License 2.0
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);
  int splitStep = splits.length / splitsToSample;
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i * splitStep],
        job, Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      samples.add(key);
      key = reader.createKey();
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 5
Source File: MneMapredLongDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = {"testWriteLongData"})
public void testReadLongData() throws Exception {
  long sum = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
            && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
            && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
              new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
              new MneInputFormat<MneDurableInputValue<Long>, Long>();
      RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
              inputFormat.getRecordReader(split, m_conf, null);
      MneDurableInputValue<Long> mdval = null;
      NullWritable mdkey = reader.createKey();
      while (true) {
        mdval = reader.createValue();
        if (reader.next(mdkey, mdval)) {
          sum += mdval.getValue();
          ++reccnt;
        } else {
          break;
        }
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_sum, sum);
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  System.out.println(String.format("The checksum of long data is %d", sum));
}
 
Example 6
Source File: DumpTypedBytes.java    From big-c with Apache License 2.0
/**
 * Dump given list of files to standard output as typed bytes.
 */
@SuppressWarnings("unchecked")
private int dumpTypedBytes(List<FileStatus> files) throws IOException {
  JobConf job = new JobConf(getConf()); 
  DataOutputStream dout = new DataOutputStream(System.out);
  AutoInputFormat autoInputFormat = new AutoInputFormat();
  for (FileStatus fileStatus : files) {
    FileSplit split = new FileSplit(fileStatus.getPath(), 0,
      fileStatus.getLen() * fileStatus.getBlockSize(),
      (String[]) null);
    RecordReader recReader = null;
    try {
      recReader = autoInputFormat.getRecordReader(split, job, Reporter.NULL);
      Object key = recReader.createKey();
      Object value = recReader.createValue();
      while (recReader.next(key, value)) {
        if (key instanceof Writable) {
          TypedBytesWritableOutput.get(dout).write((Writable) key);
        } else {
          TypedBytesOutput.get(dout).write(key);
        }
        if (value instanceof Writable) {
          TypedBytesWritableOutput.get(dout).write((Writable) value);
        } else {
          TypedBytesOutput.get(dout).write(value);
        }
      }
    } finally {
      if (recReader != null) {
        recReader.close();
      }
    }
  }
  dout.flush();
  return 0;
}
 
Example 7
Source File: DumpTypedBytes.java    From hadoop with Apache License 2.0
/**
 * Dump given list of files to standard output as typed bytes.
 */
@SuppressWarnings("unchecked")
private int dumpTypedBytes(List<FileStatus> files) throws IOException {
  JobConf job = new JobConf(getConf()); 
  DataOutputStream dout = new DataOutputStream(System.out);
  AutoInputFormat autoInputFormat = new AutoInputFormat();
  for (FileStatus fileStatus : files) {
    FileSplit split = new FileSplit(fileStatus.getPath(), 0,
      fileStatus.getLen() * fileStatus.getBlockSize(),
      (String[]) null);
    RecordReader recReader = null;
    try {
      recReader = autoInputFormat.getRecordReader(split, job, Reporter.NULL);
      Object key = recReader.createKey();
      Object value = recReader.createValue();
      while (recReader.next(key, value)) {
        if (key instanceof Writable) {
          TypedBytesWritableOutput.get(dout).write((Writable) key);
        } else {
          TypedBytesOutput.get(dout).write(key);
        }
        if (value instanceof Writable) {
          TypedBytesWritableOutput.get(dout).write((Writable) value);
        } else {
          TypedBytesOutput.get(dout).write(value);
        }
      }
    } finally {
      if (recReader != null) {
        recReader.close();
      }
    }
  }
  dout.flush();
  return 0;
}
 
Example 8
Source File: MneMapredPersonDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = {"testWritePersonData"})
public void testReadPersonData() throws Exception {
  long sumage = 0L;
  long reccnt = 0L;
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
            && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
            && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
      FileSplit split = new FileSplit(
              new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
      InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
              new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
      RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
              inputFormat.getRecordReader(split, m_conf, null);
      MneDurableInputValue<Person<Long>> personval = null;
      NullWritable personkey = reader.createKey();
      while (true) {
        personval = reader.createValue();
        if (reader.next(personkey, personval)) {
          AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
          sumage += personval.getValue().getAge();
          ++reccnt;
        } else {
          break;
        }
      }
      reader.close();
    }
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_sumage, sumage);
  System.out.println(String.format("The checksum of ages is %d", sumage));
}
 
Example 9
Source File: EventInputFormatTest.java    From gemfirexd-oss with Apache License 2.0
private void doTestRowSerDe(boolean concurrencyChecks) throws Exception {
  getConnection();
  Connection conn = startNetserverAndGetLocalNetConnection();
  final long statTS = System.currentTimeMillis();
  Statement st = conn.createStatement();
  st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 5000 milliseconds");
  String concurrency = "persistent ENABLE CONCURRENCY CHECKS";
  st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) partition by primary key buckets 1 hdfsstore (myhdfs) "
      +(concurrencyChecks ? concurrency : ""));

  PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)");
  ps.setInt(1, 1);
  ps.setString(2, "Value-1");
  ps.execute();
  
  //Wait for data to get to HDFS...
  String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1");
  st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)");
  
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/"));
  assertEquals(1, list.length);
  
  conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1");
  conf.set(RowInputFormat.HOME_DIR, HDFS_DIR);
  
  JobConf job = new JobConf(conf);
  job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
  RowInputFormat ipformat = new RowInputFormat();
  InputSplit[] splits = ipformat.getSplits(job, 2);
  assertEquals(1, splits.length);
  RecordReader<Key, Row> rr = ipformat.getRecordReader(splits[0], job, null);
  Key key = rr.createKey();
  Row value = rr.createValue();
  assertTrue(rr.next(key, value));
  assertEquals(1, value.getRowAsResultSet().getInt(1));
  assertEquals("Value-1", value.getRowAsResultSet().getString(2));
  assertTrue(value.getTimestamp() > statTS);
  assertFalse(value.getRowAsResultSet().next());
  
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  DataOutputStream dos = new DataOutputStream(baos);
  value.write(dos);
  dos.close();
  
  byte[] buf = baos.toByteArray();
  DataInputStream dis = new DataInputStream(new ByteArrayInputStream(buf));
  Row row = new Row();
  row.readFields(dis);
  dis.close();
  
  assertEquals(1, row.getRowAsResultSet().getInt(1));
  assertEquals("Value-1", row.getRowAsResultSet().getString(2));
  assertFalse(value.getRowAsResultSet().next());
  
  TestUtil.shutDown();
}
 
Example 10
Source File: MneMapredChunkDataTest.java    From mnemonic with Apache License 2.0
@Test(enabled = true, dependsOnMethods = {"testWriteChunkData"})
public void testReadChunkData() throws Exception {
  List<String> partfns = new ArrayList<String>();
  long reccnt = 0L;
  long tsize = 0L;
  Checksum cs = new CRC32();
  cs.reset();
  File folder = new File(m_workdir.toString());
  File[] listfiles = folder.listFiles();
  for (int idx = 0; idx < listfiles.length; ++idx) {
    if (listfiles[idx].isFile()
            && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
            && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
      partfns.add(listfiles[idx].getName());
    }
  }
  Collections.sort(partfns); // keep the order for checksum
  for (int idx = 0; idx < partfns.size(); ++idx) {
    System.out.println(String.format("Verifying : %s", partfns.get(idx)));
    FileSplit split = new FileSplit(
            new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
    InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat =
            new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
    RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader =
            inputFormat.getRecordReader(split, m_conf, null);
    MneDurableInputValue<DurableChunk<?>> dchkval = null;
    NullWritable dchkkey = reader.createKey();
    while (true) {
      dchkval = reader.createValue();
      if (reader.next(dchkkey, dchkval)) {
        byte b;
        for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
          b = unsafe.getByte(dchkval.getValue().get() + j);
          cs.update(b);
        }
        tsize += dchkval.getValue().getSize();
        ++reccnt;
      } else {
        break;
      }
    }
    reader.close();
  }
  AssertJUnit.assertEquals(m_reccnt, reccnt);
  AssertJUnit.assertEquals(m_totalsize, tsize);
  AssertJUnit.assertEquals(m_checksum, cs.getValue());
  System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}
 
Example 11
Source File: MultithreadedMapRunner.java    From big-c with Apache License 2.0
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output,
                Reporter reporter)
  throws IOException {
  try {
    // Allocate key & value instances; these objects will not be reused
    // because execution of Mapper.map is not serialized.
    K1 key = input.createKey();
    V1 value = input.createValue();

    while (input.next(key, value)) {

      executorService.execute(new MapperInvokeRunable(key, value, output,
                              reporter));

      checkForExceptionsFromProcessingThreads();

      // Allocate new key & value instances as mapper is running in parallel
      key = input.createKey();
      value = input.createValue();
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("Finished dispatching all Mappper.map calls, job "
                + job.getJobName());
    }

    // Graceful shutdown of the thread pool; it will let all scheduled
    // Runnables run to completion.
    executorService.shutdown();

    try {

      // Now waiting for all Runnables to end.
      while (!executorService.awaitTermination(100, TimeUnit.MILLISECONDS)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Awaiting all running Mappper.map calls to finish, job "
                    + job.getJobName());
        }

        // NOTE: although Mapper.map dispatching has concluded, there may still
        // be map calls in progress, and they could throw exceptions.
        checkForExceptionsFromProcessingThreads();

      }

      // NOTE: it could be that a map call has had an exception after the
      // call to awaitTermination() returned true. An edge case, but it
      // could happen.
      checkForExceptionsFromProcessingThreads();

    } catch (IOException ioEx) {
      // Force a shutdown of all threads of the thread pool and rethrow
      // the IOException
      executorService.shutdownNow();
      throw ioEx;
    } catch (InterruptedException iEx) {
      throw new RuntimeException(iEx);
    }

  } finally {
    mapper.close();
  }
}
 
Example 12
Source File: PipesMapRunner.java    From big-c with Apache License 2.0
/**
 * Run the map task.
 * @param input the set of inputs
 * @param output the object to collect the outputs of the map
 * @param reporter the object to update with status
 */
@SuppressWarnings("unchecked")
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output,
                Reporter reporter) throws IOException {
  Application<K1, V1, K2, V2> application = null;
  try {
    RecordReader<FloatWritable, NullWritable> fakeInput =
        (!Submitter.getIsJavaRecordReader(job) &&
         !Submitter.getIsJavaMapper(job))
            ? (RecordReader<FloatWritable, NullWritable>) input : null;
    application = new Application<K1, V1, K2, V2>(job, fakeInput, output, 
                                                  reporter,
        (Class<? extends K2>) job.getOutputKeyClass(), 
        (Class<? extends V2>) job.getOutputValueClass());
  } catch (InterruptedException ie) {
    throw new RuntimeException("interrupted", ie);
  }
  DownwardProtocol<K1, V1> downlink = application.getDownlink();
  boolean isJavaInput = Submitter.getIsJavaRecordReader(job);
  downlink.runMap(reporter.getInputSplit(), 
                  job.getNumReduceTasks(), isJavaInput);
  boolean skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false);
  try {
    if (isJavaInput) {
      // allocate key & value instances that are re-used for all entries
      K1 key = input.createKey();
      V1 value = input.createValue();
      downlink.setInputTypes(key.getClass().getName(),
                             value.getClass().getName());
      
      while (input.next(key, value)) {
        // map pair to output
        downlink.mapItem(key, value);
        if(skipping) {
          //flush the streams on every record input if running in skip mode
          //so that we don't buffer other records surrounding a bad record.
          downlink.flush();
        }
      }
      downlink.endOfInput();
    }
    application.waitForFinish();
  } catch (Throwable t) {
    application.abort(t);
  } finally {
    application.cleanup();
  }
}
 
Example 13
Source File: EventInputFormatTest.java    From gemfirexd-oss with Apache License 2.0
private void deleteTest( boolean writeOnly, boolean primaryKey, boolean isTransactional) throws Exception {
  //getConnection();
  Connection conn = null;
  if (isTransactional) {
    conn = getTxConnection();//startNetserverAndGetLocalNetConnection();
  } else {
    conn = getConnection();
  }
  
  Statement st = conn.createStatement();
  st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 2000 milliseconds");
  String primaryKeyString = primaryKey ? "primary key" : "";
  st.execute("create table app.mytab1 (col1 int " + primaryKeyString + ", col2 varchar(100)) BUCKETS 1 persistent hdfsstore (myhdfs) " + (writeOnly? " WRITEONLY " : ""));
  

  PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)");
  for(int i = 0; i < 3; i++) {
    ps.setInt(1, i);
    ps.setString(2, "Value-" + System.nanoTime());
    ps.execute();
  }
  
  st.execute("delete from mytab1 where col1 = 1");
  //Wait for data to get to HDFS...
  String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1");
  st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)");
  
  TestUtil.shutDown();

  FileStatus[] list = null;
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  for (int i = 0; i < 20; i++) {
    list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/"));
    if (list.length == 1) {
      break;
    }
    Thread.sleep(500);
  }
  if (list.length != 1) {
    fail("unexpected files: " + java.util.Arrays.toString(list));
  }

  conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1");
  conf.set(RowInputFormat.HOME_DIR, HDFS_DIR);
  
  JobConf job = new JobConf(conf);
  job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
  RowInputFormat ipformat = new RowInputFormat();
  InputSplit[] splits = ipformat.getSplits(job, 2);
  RecordReader<Key, Row> rr = ipformat.getRecordReader(splits[0], job, null);
  Key key = rr.createKey();
  Row value = rr.createValue();

  rr.next(key, value); 
  assertEquals(0, value.getRowAsResultSet().getInt("col1"));
  
  if (!writeOnly) {
    rr.next(key, value);
    checkForDeletedRow(value, primaryKey);
  }
  rr.next(key, value); 
  assertEquals(1, value.getRowAsResultSet().getInt("col1"));
  
  rr.next(key, value); 
  assertEquals(2, value.getRowAsResultSet().getInt("col1"));
  
  if (writeOnly) {
    rr.next(key, value);
    checkForDeletedRow(value, primaryKey);
  }
  assertFalse(rr.next(key,  value));
  
  TestUtil.shutDown();
}
 
Example 14
Source File: GenericHiveRecordCursor.java    From presto with Apache License 2.0
public GenericHiveRecordCursor(
        Configuration configuration,
        Path path,
        RecordReader<K, V> recordReader,
        long totalBytes,
        Properties splitSchema,
        List<HiveColumnHandle> columns,
        DateTimeZone hiveStorageTimeZone)
{
    requireNonNull(path, "path is null");
    requireNonNull(recordReader, "recordReader is null");
    checkArgument(totalBytes >= 0, "totalBytes is negative");
    requireNonNull(splitSchema, "splitSchema is null");
    requireNonNull(columns, "columns is null");
    requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null");

    this.path = path;
    this.recordReader = recordReader;
    this.totalBytes = totalBytes;
    this.key = recordReader.createKey();
    this.value = recordReader.createValue();
    this.hiveStorageTimeZone = hiveStorageTimeZone;

    this.deserializer = getDeserializer(configuration, splitSchema);
    this.rowInspector = getTableObjectInspector(deserializer);

    int size = columns.size();

    this.types = new Type[size];
    this.hiveTypes = new HiveType[size];

    this.structFields = new StructField[size];
    this.fieldInspectors = new ObjectInspector[size];

    this.loaded = new boolean[size];
    this.booleans = new boolean[size];
    this.longs = new long[size];
    this.doubles = new double[size];
    this.slices = new Slice[size];
    this.objects = new Object[size];
    this.nulls = new boolean[size];

    // initialize data columns
    for (int i = 0; i < columns.size(); i++) {
        HiveColumnHandle column = columns.get(i);
        checkState(column.getColumnType() == REGULAR, "column type must be regular");

        types[i] = column.getType();
        hiveTypes[i] = column.getHiveType();

        StructField field = rowInspector.getStructFieldRef(column.getName());
        structFields[i] = field;
        fieldInspectors[i] = field.getFieldObjectInspector();
    }
}
 
Example 15
Source File: EventInputFormatTest.java    From gemfirexd-oss with Apache License 2.0
public void testNoSecureHdfsCheck() throws Exception {
  getConnection();
  Connection conn = startNetserverAndGetLocalNetConnection();
  
  
  Statement st = conn.createStatement();
  st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "'  batchtimeinterval 5000 milliseconds");
  st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) persistent hdfsstore (myhdfs) BUCKETS 1");
  
  PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)");
  int NUM_ENTRIES = 20;
  for(int i = 0; i < NUM_ENTRIES; i++) {
    ps.setInt(1, i);
    ps.setString(2, "Value-" + System.nanoTime());
    ps.execute();
  }
  //Wait for data to get to HDFS...
  String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1");
  st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)");
  
  stopNetServer();
  FabricServiceManager.currentFabricServiceInstance().stop(new Properties());
  
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/"));
  assertEquals(1, list.length);
  
  conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1");
  conf.set(RowInputFormat.HOME_DIR, HDFS_DIR);
  conf.set("hadoop.security.authentication", "kerberos");
  
  JobConf job = new JobConf(conf);
  job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
  RowInputFormat ipformat = new RowInputFormat();
  InputSplit[] splits = ipformat.getSplits(job, 2);
  assertEquals(1, splits.length);
  CombineFileSplit split = (CombineFileSplit) splits[0];
  assertEquals(1, split.getPaths().length);
  assertEquals(list[0].getPath().toString(), split.getPath(0).toString());
  assertEquals(0, split.getOffset(0));
  assertEquals(list[0].getLen(), split.getLength(0));
  
  RecordReader<Key, Row> rr = ipformat.getRecordReader(split, job, null);
  Key key = rr.createKey();
  Row value = rr.createValue();
  
  int count = 0;
  while (rr.next(key, value)) {
    assertEquals(count++, value.getRowAsResultSet().getInt("col1"));
  }
  
  assertEquals(20, count);
  
  TestUtil.shutDown();
}
 
Example 16
Source File: TestHoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
@Test
@Disabled
public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception {

  Configuration conf = new Configuration();
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  final int numRecords = 1000;
  // Create 3 parquet files with 1000 records each
  File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
  InputFormatTestUtil.commit(tempDir, commitTime);

  // insert 1000 update records to log file 0
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer =
      InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime,
          numRecords, numRecords, 0);
  writer.close();
  // insert 1000 update records to log file 1
  writer =
      InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime,
          numRecords, numRecords, 0);
  writer.close();
  // insert 1000 update records to log file 2
  writer =
      InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime,
          numRecords, numRecords, 0);
  writer.close();

  TableDesc tblDesc = Utilities.defaultTd;
  // Set the input format
  tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class);
  PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
  LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
  pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
  MapredWork mrwork = new MapredWork();
  mrwork.getMapWork().setPathToPartitionInfo(pt);
  Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
  Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
  jobConf = new JobConf(conf);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
  jobConf.set(HAS_MAP_WORK, "true");
  // The following config tells Hive to choose ExecMapper to read the MAP_WORK
  jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
  // setting the split size to be 3 to create one split for 3 file groups
  jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3");

  HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
  String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
  InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
  InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
  // Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups
  assertEquals(1, splits.length);
  RecordReader<NullWritable, ArrayWritable> recordReader =
      combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
  NullWritable nullWritable = recordReader.createKey();
  ArrayWritable arrayWritable = recordReader.createValue();
  int counter = 0;
  while (recordReader.next(nullWritable, arrayWritable)) {
    // read over all the splits
    counter++;
  }
  // should read out 3 splits, each for file0, file1, file2 containing 1000 records each
  assertEquals(3000, counter);
}
 
Example 17
Source File: GFXDHiveInputFormatTest.java    From gemfirexd-oss with Apache License 2.0
public void testHiveInputFormat() throws Exception {
  getConnection();
  Connection conn = startNetserverAndGetLocalNetConnection();
  
  Statement st = conn.createStatement();
  st.execute("create hdfsstore myhdfs namenode 'localhost' homedir '" + HDFS_DIR + "' batchtimeinterval 5000 milliseconds");
  st.execute("create table app.mytab1 (col1 int primary key, col2 varchar(100)) persistent hdfsstore (myhdfs) BUCKETS 1");

  PreparedStatement ps = conn.prepareStatement("insert into mytab1 values (?, ?)");
  int NUM_ENTRIES = 20;
  for(int i = 0; i < NUM_ENTRIES; i++) {
    ps.setInt(1, i);
    ps.setString(2, "Value-" + System.nanoTime());
    ps.execute();
  }
  //Wait for data to get to HDFS...
  String qname = HDFSStoreFactoryImpl.getEventQueueName("/APP/MYTAB1");
  st.execute("CALL SYS.WAIT_FOR_SENDER_QUEUE_FLUSH('" + qname + "', 1, 0)");
  
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FileStatus[] list = fs.listStatus(new Path(HDFS_DIR + "/APP_MYTAB1/0/"));
  assertEquals(1, list.length);
  
  conf.set(RowInputFormat.INPUT_TABLE, "MYTAB1");
  conf.set(RowInputFormat.HOME_DIR, HDFS_DIR);
  
  JobConf job = new JobConf(conf);
  job.setBoolean(RowInputFormat.CHECKPOINT_MODE, false);
  GFXDHiveInputFormat ipformat = new GFXDHiveInputFormat();
  InputSplit[] splits = ipformat.getSplits(job, 2);
  assertEquals(1, splits.length);
  GFXDHiveSplit split = (GFXDHiveSplit) splits[0];
  assertEquals(1, split.getPaths().length);
  assertEquals(list[0].getPath().toString(), split.getPath(0).toString());
  assertEquals(0, split.getOffset(0));
  assertEquals(list[0].getLen(), split.getLength(0));

  RecordReader<Key, Row> rr = ipformat.getRecordReader(split, job, null);
  assertTrue("Row record reader should be an instace of GFXDHiveRowRecordReader " +
  		"but it is an instance of " + rr.getClass(),  (rr instanceof GFXDHiveRowRecordReader));
  Key key = rr.createKey();
  Row value = rr.createValue();

  int count = 0;
  while (rr.next(key, value)) {
    assertEquals(count++, value.getRowAsResultSet().getInt("col1"));
  }
  
  assertEquals(20, count);
  
  TestUtil.shutDown();
}
 
Example 18
Source File: InputSampler.java    From hadoop with Apache License 2.0
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, JobConf job) throws IOException {
  InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.length);

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.length; ++i) {
    InputSplit tmp = splits[i];
    int j = r.nextInt(splits.length);
    splits[i] = splits[j];
    splits[j] = tmp;
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
                 (i < splits.length && samples.size() < numSamples); ++i) {
    RecordReader<K,V> reader = inf.getRecordReader(splits[i], job,
        Reporter.NULL);
    K key = reader.createKey();
    V value = reader.createValue();
    while (reader.next(key, value)) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(key);
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, key);
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
        key = reader.createKey();
      }
    }
    reader.close();
  }
  return (K[])samples.toArray();
}
 
Example 19
Source File: MultithreadedMapRunner.java    From hadoop with Apache License 2.0
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output,
                Reporter reporter)
  throws IOException {
  try {
    // Allocate key & value instances; these objects will not be reused
    // because execution of Mapper.map is not serialized.
    K1 key = input.createKey();
    V1 value = input.createValue();

    while (input.next(key, value)) {

      executorService.execute(new MapperInvokeRunable(key, value, output,
                              reporter));

      checkForExceptionsFromProcessingThreads();

      // Allocate new key & value instances as mapper is running in parallel
      key = input.createKey();
      value = input.createValue();
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("Finished dispatching all Mappper.map calls, job "
                + job.getJobName());
    }

    // Graceful shutdown of the Threadpool, it will let all scheduled
    // Runnables to end.
    executorService.shutdown();

    try {

      // Now waiting for all Runnables to end.
      while (!executorService.awaitTermination(100, TimeUnit.MILLISECONDS)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Awaiting all running Mappper.map calls to finish, job "
                    + job.getJobName());
        }

        // NOTE: while Mapper.map dispatching has concluded there are still
        // map calls in progress and exceptions would be thrown.
        checkForExceptionsFromProcessingThreads();

      }

      // NOTE: it could be that a map call has had an exception after the
      // call for awaitTermination() returing true. And edge case but it
      // could happen.
      checkForExceptionsFromProcessingThreads();

    } catch (IOException ioEx) {
      // Forcing a shutdown of all thread of the threadpool and rethrowing
      // the IOException
      executorService.shutdownNow();
      throw ioEx;
    } catch (InterruptedException iEx) {
      throw new RuntimeException(iEx);
    }

  } finally {
    mapper.close();
  }
}
 
Example 20
Source File: PipesMapRunner.java    From hadoop with Apache License 2.0
/**
 * Run the map task.
 * @param input the set of inputs
 * @param output the object to collect the outputs of the map
 * @param reporter the object to update with status
 */
@SuppressWarnings("unchecked")
public void run(RecordReader<K1, V1> input, OutputCollector<K2, V2> output,
                Reporter reporter) throws IOException {
  Application<K1, V1, K2, V2> application = null;
  try {
    RecordReader<FloatWritable, NullWritable> fakeInput =
        (!Submitter.getIsJavaRecordReader(job) &&
         !Submitter.getIsJavaMapper(job))
            ? (RecordReader<FloatWritable, NullWritable>) input : null;
    application = new Application<K1, V1, K2, V2>(job, fakeInput, output, 
                                                  reporter,
        (Class<? extends K2>) job.getOutputKeyClass(), 
        (Class<? extends V2>) job.getOutputValueClass());
  } catch (InterruptedException ie) {
    throw new RuntimeException("interrupted", ie);
  }
  DownwardProtocol<K1, V1> downlink = application.getDownlink();
  boolean isJavaInput = Submitter.getIsJavaRecordReader(job);
  downlink.runMap(reporter.getInputSplit(), 
                  job.getNumReduceTasks(), isJavaInput);
  boolean skipping = job.getBoolean(MRJobConfig.SKIP_RECORDS, false);
  try {
    if (isJavaInput) {
      // allocate key & value instances that are re-used for all entries
      K1 key = input.createKey();
      V1 value = input.createValue();
      downlink.setInputTypes(key.getClass().getName(),
                             value.getClass().getName());
      
      while (input.next(key, value)) {
        // map pair to output
        downlink.mapItem(key, value);
        if(skipping) {
          //flush the streams on every record input if running in skip mode
          //so that we don't buffer other records surrounding a bad record.
          downlink.flush();
        }
      }
      downlink.endOfInput();
    }
    application.waitForFinish();
  } catch (Throwable t) {
    application.abort(t);
  } finally {
    application.cleanup();
  }
}