org.kitesdk.data.DatasetReader Java Examples

The following examples show how to use org.kitesdk.data.DatasetReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DatasetTestUtilities.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the common reader checks: ensures the reader is open, delegates to
 * {@code checkReaderIteration}, closes the reader, and verifies that it
 * reports being closed afterwards.
 *
 * @param reader the reader under test; it is always closed by this method
 * @param totalRecords record count forwarded to {@code checkReaderIteration}
 * @param validator validator forwarded to {@code checkReaderIteration}
 */
public static <R> void checkReaderBehavior(
    DatasetReader<R> reader, int totalRecords, RecordValidator<R> validator) {
  // Readers may arrive either initialized or not, since initialization now
  // happens automatically in newReader; only initialize the ones still closed.
  if (!reader.isOpen()) {
    if (reader instanceof InitializeAccessor) {
      InitializeAccessor initializable = (InitializeAccessor) reader;
      initializable.initialize();
    }
  }

  try {
    Assert.assertTrue("Reader should be open", reader.isOpen());
    checkReaderIteration(reader, totalRecords, validator);
  } finally {
    // close even when an assertion above fails
    reader.close();
  }

  Assert.assertFalse("Reader is open after close()", reader.isOpen());
}
 
Example #2
Source File: ReadDataset.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the events dataset and prints every record to standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  final Dataset<GenericRecord> eventDataset =
      Datasets.load("dataset:hive:/tmp/data/default/events");

  // The reader is created outside the try, so the finally block only runs
  // once there is something to close.
  final DatasetReader<GenericRecord> eventReader = eventDataset.newReader();
  try {
    for (GenericRecord record : eventReader) {
      System.out.println(record);
    }
  } finally {
    eventReader.close();
  }

  return 0;
}
 
Example #3
Source File: ReadMovies.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the movies dataset and writes each record to standard error,
 * prefixed with "Movie: ".
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  final Dataset<Record> movieDataset =
      Datasets.load("dataset:hdfs:/tmp/data/movies", Record.class);

  DatasetReader<Record> movieReader = null;
  try {
    movieReader = movieDataset.newReader();
    for (Record movie : movieReader) {
      System.err.println("Movie: " + movie);
    }
  } finally {
    // stays null if newReader() itself threw
    if (movieReader != null) {
      movieReader.close();
    }
  }

  return 0;
}
 
Example #4
Source File: ReadDataset.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the whole events dataset and prints one line per record on
 * standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  // Load the dataset, then obtain a reader over all of its records.
  final Dataset<GenericRecord> dataset =
      Datasets.load("dataset:hive:/tmp/data/default/events");
  final DatasetReader<GenericRecord> records = dataset.newReader();

  try {
    for (GenericRecord current : records) {
      System.out.println(current);
    }
  } finally {
    // release the reader whether or not iteration succeeded
    records.close();
  }

  return 0;
}
 
Example #5
Source File: ReadProductDatasetPojo.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the products dataset as {@code Product} objects and prints each one
 * to standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  final Dataset<Product> productDataset =
      Datasets.load("dataset:hdfs:/tmp/data/products", Product.class);

  // Read every product; the reader is closed in finally if it was opened.
  DatasetReader<Product> productReader = null;
  try {
    productReader = productDataset.newReader();
    for (Product item : productReader) {
      System.out.println(item);
    }
  } finally {
    if (productReader != null) {
      productReader.close();
    }
  }

  return 0;
}
 
Example #6
Source File: ReadUserDatasetGenericOnePartition.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the users dataset, restricts it with
 * {@code with("favoriteColor", "green")}, and prints each record of the
 * resulting view to standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  final Dataset<Record> userDataset =
      Datasets.load("dataset:hdfs:/tmp/data/users", Record.class);

  DatasetReader<Record> greenReader = null;
  try {
    // the reader is opened on the refined view, not on the full dataset
    greenReader = userDataset.with("favoriteColor", "green").newReader();
    for (GenericRecord greenUser : greenReader) {
      System.out.println(greenUser);
    }
  } finally {
    if (greenReader != null) {
      greenReader.close();
    }
  }

  return 0;
}
 
Example #7
Source File: ReadHiveUserDatasetGeneric.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the Hive-backed users dataset and prints each record to standard
 * output.
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  Dataset<Record> users = Datasets.load(
      "dataset:hive?dataset=users", Record.class);

  // Get a reader for the dataset and read all the users
  DatasetReader<Record> reader = null;
  try {
    reader = users.newReader();
    // Iterate the same reader that the finally block closes, so that no
    // additional reader is opened and left unclosed.
    for (GenericRecord user : reader) {
      System.out.println(user);
    }

  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example #8
Source File: ReadUserDatasetGeneric.java    From kite-examples with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the users dataset and prints each record to standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return always 0
 * @throws Exception if loading or reading the dataset fails
 */
@Override
public int run(String[] args) throws Exception {
  final Dataset<Record> userDataset =
      Datasets.load("dataset:hdfs:/tmp/data/users", Record.class);

  DatasetReader<Record> userReader = null;
  try {
    userReader = userDataset.newReader();
    for (GenericRecord entry : userReader) {
      System.out.println(entry);
    }
  } finally {
    // nothing to close when newReader() failed before assignment
    if (userReader != null) {
      userReader.close();
    }
  }

  return 0;
}
 
Example #9
Source File: TestMapReduce.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the output dataset into a name-to-count map and asserts the expected
 * counts for "apple", "banana" and "carrot".
 *
 * @param existingPresent when true, "date" must have count 4; otherwise
 *          "date" must be absent from the output
 */
private void checkOutput(boolean existingPresent) {
  DatasetReader<GenericData.Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  try {
    for (GenericData.Record record : reader) {
      counts.put(record.get("name").toString(), (Integer) record.get("count"));
    }
  } finally {
    // close the reader even if reading or the casts above throw
    reader.close();
  }

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());
  if (existingPresent) {
    Assert.assertEquals(4, counts.get("date").intValue());
  } else {
    Assert.assertNull(counts.get("date"));
  }
}
 
Example #10
Source File: UserProfileDatasetExample.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Prints every composite profile/actions record whose profile last name
 * equals the given last name.
 *
 * The records come from the composite dataset, so each one carries both the
 * profile model and the actions model. At present the whole dataset is read
 * and filtered here; see the TODO about using a start key.
 *
 * @param lastName
 *          The last name of users to print.
 */
public void printUserProfileActionsForLastName(String lastName) {
  // TODO: use a reader with a start key
  DatasetReader<UserProfileActionsModel2> compositeReader =
      userProfileActionsDataset.newReader();
  try {
    for (UserProfileActionsModel2 composite : compositeReader) {
      UserProfileModel2 profile = composite.getUserProfileModel();
      if (!profile.getLastName().equals(lastName)) {
        continue;
      }
      System.out.println(composite.toString());
    }
  } finally {
    // the reader must always be released
    compositeReader.close();
  }
}
 
Example #11
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the import twice, the second time with {@code --append}, and expects
 * the reader to yield exactly two records whose DATA_COL0 is 1.
 */
public void testIncrementalParquetImport() throws IOException, SQLException {
  final String[] columnTypes = { "INT" };
  final String[] columnValues = { "1" };
  createTableWithColTypes(columnTypes, columnValues);

  runImport(getOutputArgv(true, null));
  final String[] appendArgs = { "--append" };
  runImport(getOutputArgv(true, appendArgs));

  final DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());
    final GenericRecord first = imported.next();
    assertEquals(1, first.get("DATA_COL0"));
    final GenericRecord second = imported.next();
    assertEquals(1, second.get("DATA_COL0"));
    // nothing may follow the two imported rows
    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
 
Example #12
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Runs an import with the arguments from {@code getOutputQueryArgv} and
 * expects exactly one record whose DATA_COL0 is 1.
 */
public void testQueryImport() throws IOException, SQLException {
  final String[] columnTypes = { "INT" };
  final String[] columnValues = { "1" };
  createTableWithColTypes(columnTypes, columnValues);

  runImport(getOutputQueryArgv(true, null));

  final DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());
    final GenericRecord onlyRecord = imported.next();
    assertEquals(1, onlyRecord.get("DATA_COL0"));
    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
 
Example #13
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Imports a table whose single value is null and expects exactly one record
 * with a null DATA_COL0.
 */
public void testNullableParquetImport() throws IOException, SQLException {
  final String[] columnTypes = { "INT" };
  final String[] columnValues = { null };
  createTableWithColTypes(columnTypes, columnValues);

  runImport(getOutputArgv(true, null));

  final DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());
    final GenericRecord onlyRecord = imported.next();
    assertNull(onlyRecord.get("DATA_COL0"));
    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
 
Example #14
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Imports a column named "test_p-a+r/quet" and expects both the schema field
 * and the record key to be "TEST_P_A_R_QUET", with the single record holding
 * the value 2015.
 */
public void testNonIdentCharactersInColumnName() throws IOException {
  final String[] columnNames = { "test_p-a+r/quet" };
  final String[] columnTypes = { "INT" };
  final String[] columnValues = { "2015" };
  createTableWithColTypesAndNames(columnNames, columnTypes, columnValues);

  runImport(getOutputArgv(true, null));

  // schema check: one record-typed schema with a single INT field
  final Schema importedSchema = getSchema();
  assertEquals(Type.RECORD, importedSchema.getType());
  final List<Field> schemaFields = importedSchema.getFields();
  assertEquals(columnTypes.length, schemaFields.size());
  checkField(schemaFields.get(0), "TEST_P_A_R_QUET", Type.INT);

  // data check: exactly one record under the expected field name
  final DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());
    final GenericRecord onlyRecord = imported.next();
    assertEquals("TEST_P_A_R_QUET", 2015, onlyRecord.get("TEST_P_A_R_QUET"));
    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
 
Example #15
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Imports a column named "_NAME" and expects both the schema field and the
 * record key to be "__NAME", with the single record holding the value 1987.
 */
public void testFirstUnderscoreInColumnName() throws IOException {
  final String[] columnNames = { "_NAME" };
  final String[] columnTypes = { "INT" };
  final String[] columnValues = { "1987" };
  createTableWithColTypesAndNames(columnNames, columnTypes, columnValues);

  runImport(getOutputArgv(true, null));

  // schema check: one record-typed schema with a single INT field
  final Schema importedSchema = getSchema();
  assertEquals(Type.RECORD, importedSchema.getType());
  final List<Field> schemaFields = importedSchema.getFields();
  assertEquals(columnTypes.length, schemaFields.size());
  checkField(schemaFields.get(0), "__NAME", Type.INT);

  // data check: exactly one record under the expected field name
  final DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());
    final GenericRecord onlyRecord = imported.next();
    assertEquals("__NAME", 1987, onlyRecord.get("__NAME"));
    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
 
Example #16
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Imports with {@code --map-column-java DATA_COL0=String} and expects
 * DATA_COL0 to be a STRING field in the schema, with the single record
 * holding the value "10".
 */
public void testOverrideTypeMapping() throws IOException {
  final String[] columnTypes = { "INT" };
  final String[] columnValues = { "10" };
  createTableWithColTypes(columnTypes, columnValues);

  final String[] mappingArgs = { "--map-column-java", "DATA_COL0=String" };
  runImport(getOutputArgv(true, mappingArgs));

  // schema check: the overridden column must be reported as STRING
  final Schema importedSchema = getSchema();
  assertEquals(Type.RECORD, importedSchema.getType());
  final List<Field> schemaFields = importedSchema.getFields();
  assertEquals(columnTypes.length, schemaFields.size());
  checkField(schemaFields.get(0), "DATA_COL0", Type.STRING);

  // data check: exactly one record with the expected value
  final DatasetReader<GenericRecord> imported = getReader();
  try {
    assertTrue(imported.hasNext());
    final GenericRecord onlyRecord = imported.next();
    assertEquals("DATA_COL0", "10", onlyRecord.get("DATA_COL0"));
    assertFalse(imported.hasNext());
  } finally {
    imported.close();
  }
}
 
Example #17
Source File: TestInputFormatValueReader.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code InputFormatReader} over {@code userFile}. The descriptor
 * names TextInputFormat as the input format class, sets the record property
 * to "value", and uses a STRING schema.
 *
 * @return a new reader; the caller is responsible for closing it
 * @throws IOException declared by the overridden method
 */
@Override
public DatasetReader<Text> newReader() throws IOException {
  final String inputFormatClass =
      "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
  final String recordSource = "value";

  final DatasetDescriptor valueDescriptor = new DatasetDescriptor.Builder()
      .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP, inputFormatClass)
      .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, recordSource)
      .schema(Schema.create(Schema.Type.STRING))
      .build();

  return new InputFormatReader<Text>(localfs, userFile, valueDescriptor);
}
 
Example #18
Source File: DatasetTestUtilities.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record of the given view into a set.
 *
 * @param ds the view to read
 * @return a new hash set holding all records produced by the view's reader
 */
public static <E> Set<E> materialize(View<E> ds) {
  final Set<E> collected = Sets.newHashSet();

  DatasetReader<E> source = null;
  try {
    source = ds.newReader();
    for (E item : source) {
      collected.add(item);
    }
  } finally {
    // stays null if newReader() itself threw
    if (source != null) {
      source.close();
    }
  }

  return collected;
}
 
Example #19
Source File: TestFileSystemDataset.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Reads all records of the partition identified by {@code key}, asserting
 * that each record's hashed "username" (and "email", when the key has more
 * than one component) matches the key values, and counts the records read.
 *
 * @param ds the dataset that owns the partition
 * @param key the partition to read; looked up without creating it
 * @param subpartitionName if non-null, the partition's strategy must have
 *          exactly one field partitioner, and it must have this name
 * @return the number of records read from the partition
 */
@SuppressWarnings("deprecation")
private int readTestUsersInPartition(FileSystemDataset<Record> ds, PartitionKey key,
    String subpartitionName) {
  int recordsSeen = 0;
  DatasetReader<Record> partitionReader = null;
  try {
    final PartitionedDataset<Record> target = ds.getPartition(key, false);

    if (subpartitionName != null) {
      final List<FieldPartitioner> partitioners = Accessor.getDefault()
          .getFieldPartitioners(target.getDescriptor().getPartitionStrategy());
      Assert.assertEquals(1, partitioners.size());
      Assert.assertEquals(subpartitionName, partitioners.get(0).getName());
    }

    partitionReader = target.newReader();
    for (GenericData.Record user : partitionReader) {
      // first key component: non-negative username hash modulo 2
      Assert.assertEquals(user.toString(), key.get(0),
          (user.get("username").hashCode() & Integer.MAX_VALUE) % 2);
      // second key component, when present: non-negative email hash modulo 3
      if (key.getLength() > 1) {
        Assert.assertEquals(key.get(1),
            (user.get("email").hashCode() & Integer.MAX_VALUE) % 3);
      }
      recordsSeen += 1;
    }
  } finally {
    if (partitionReader != null) {
      partitionReader.close();
    }
  }
  return recordsSeen;
}
 
Example #20
Source File: TestFileSystemDatasetReader.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code FileSystemDatasetReader} over the classpath resource
 * "data/strings-100.avro", using {@code STRING_SCHEMA}.
 *
 * @return a new reader; the caller is responsible for closing it
 * @throws IOException declared by the overridden method
 */
@Override
public DatasetReader<Record> newReader() throws IOException {
  final String resourceName = "data/strings-100.avro";
  return new FileSystemDatasetReader<Record>(
      LocalFileSystem.getInstance(),
      new Path(Resources.getResource(resourceName).getFile()),
      STRING_SCHEMA,
      Record.class);
}
 
Example #21
Source File: TestMultiFileDatasetReader.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code MultiFileDatasetReader} that reads {@code TEST_FILE}
 * twice, using a file system obtained from a fresh Configuration.
 *
 * NOTE(review): the return type is raw here; confirm against the overridden
 * declaration before parameterizing it.
 *
 * @return a new reader; the caller is responsible for closing it
 * @throws IOException if the file system cannot be obtained
 */
@Override
public DatasetReader newReader() throws IOException {
  final Configuration conf = new Configuration();
  final FileSystem fileSystem = FileSystem.get(conf);
  return new MultiFileDatasetReader<Record>(
      fileSystem,
      Lists.newArrayList(TEST_FILE, TEST_FILE),
      DESCRIPTOR,
      CONSTRAINTS,
      ACCESSOR);
}
 
Example #22
Source File: TestCSVFileReader.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code CSVFileReader} over {@code validatorFile}. The descriptor
 * sets "kite.csv.has-header" to "true" and uses {@code VALIDATOR_SCHEMA}.
 *
 * @return a new reader; the caller is responsible for closing it
 * @throws IOException declared by the overridden method
 */
@Override
public DatasetReader<GenericData.Record> newReader() throws IOException {
  final String headerProperty = "kite.csv.has-header";
  final DatasetDescriptor csvDescriptor = new DatasetDescriptor.Builder()
      .property(headerProperty, "true")
      .schema(VALIDATOR_SCHEMA)
      .build();

  return new CSVFileReader<GenericData.Record>(
      localfs,
      validatorFile,
      csvDescriptor,
      DataModelUtil.accessor(GenericData.Record.class, csvDescriptor.getSchema()));
}
 
Example #23
Source File: AbstractRefinableView.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether this view yields no records, by opening a reader and
 * checking for a first record.
 *
 * @return true when the reader has no next record
 */
@Override
public boolean isEmpty() {
  DatasetReader<E> probe = null;
  try {
    // a reader is used because files may be present but empty
    probe = newReader();
    final boolean hasRecord = probe.hasNext();
    return !hasRecord;
  } finally {
    if (probe != null) {
      probe.close();
    }
  }
}
 
Example #24
Source File: DatasetTestUtilities.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Iterates an open reader to the end and checks its iteration contract:
 * records are non-null and pass the validator, extra {@code hasNext} calls
 * do not skip records, {@code next} throws NoSuchElementException once
 * exhausted, the reader stays open, and the record count matches.
 *
 * @param reader an open reader; it is not closed by this method
 * @param expectedRecordCount the number of records the reader must produce
 * @param validator called with each record and its zero-based position
 */
public static <R> void checkReaderIteration(DatasetReader<R> reader,
    int expectedRecordCount, RecordValidator<R> validator) {
  Assert.assertTrue("Reader is not open", reader.isOpen());
  Assert.assertTrue("Reader has no records, expected " + expectedRecordCount,
      (expectedRecordCount == 0) || reader.hasNext());

  int seen = 0;
  for (R current : reader) {
    validator.validate(current, seen);
    Assert.assertNotNull(current);
    // an extra hasNext call here must not advance the iteration
    reader.hasNext();
    seen += 1;
  }

  Assert.assertFalse("Reader is empty, but hasNext is true",
      reader.hasNext());

  // once hasNext is false, next() has to throw NoSuchElementException
  boolean threwAsExpected = false;
  try {
    reader.next();
  } catch (NoSuchElementException expected) {
    threwAsExpected = true;
  }
  if (!threwAsExpected) {
    Assert.fail("Reader did not throw NoSuchElementException");
  }

  Assert.assertTrue("Reader is empty, but should be open", reader.isOpen());

  // the count is wrong if hasNext advanced the reader
  Assert.assertEquals("Incorrect number of records",
      expectedRecordCount, seen);
}
 
Example #25
Source File: TestInputFormatKeyReader.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code InputFormatReader} over {@code userFile}. The descriptor
 * names TextInputFormat as the input format class, sets the record property
 * to "key", and uses a LONG schema.
 *
 * @return a new reader; the caller is responsible for closing it
 * @throws IOException declared by the overridden method
 */
@Override
public DatasetReader<LongWritable> newReader() throws IOException {
  final String inputFormatClass =
      "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
  final String recordSource = "key";

  final DatasetDescriptor keyDescriptor = new DatasetDescriptor.Builder()
      .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP, inputFormatClass)
      .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, recordSource)
      .schema(Schema.create(Schema.Type.LONG))
      .build();

  return new InputFormatReader<LongWritable>(localfs, userFile, keyDescriptor);
}
 
Example #26
Source File: TestAvroWriter.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 10000 records, flushes, writes 1000 more, then forces the writer
 * into the ERROR state before closing it. Expects a single visible data file
 * whose contents equal the records written before the flush.
 */
@Test
public void testCommitFlushedRecords() throws IOException {
  init(fsWriter);

  List<Record> written = Lists.newArrayList();
  long i;
  for (i = 0; i < 10000; i += 1) {
    Record record = record(i, "test-" + i);
    fsWriter.write(record);
    written.add(record);
  }

  ((Flushable) fsWriter).flush();

  // these records are written after the flush and are not expected back
  for (i = 10000; i < 11000; i += 1) {
    fsWriter.write(record(i, "test-" + i));
  }

  // put the writer into an error state, simulating either:
  // 1. A failed record with an IOException or unknown RuntimeException
  // 2. A failed flush or sync for IncrementableWriters
  fsWriter.state = ReaderWriterState.ERROR;

  fsWriter.close();

  FileStatus[] stats = fs.listStatus(testDirectory, PathFilters.notHidden());
  Assert.assertEquals("Should contain a visible data file", 1, stats.length);

  DatasetReader<Record> reader = newReader(stats[0].getPath(), TEST_SCHEMA);
  try {
    Assert.assertEquals("Should match written records",
        written, Lists.newArrayList((Iterator) init(reader)));
  } finally {
    // release the verification reader even when the assertion fails
    reader.close();
  }
}
 
Example #27
Source File: UserProfileDatasetExample.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Prints every user profile in the dataset.
 *
 * The reader is opened over the whole dataset, with no start or stop keys,
 * and each profile is written to standard output.
 */
public void printUserProfies() {
  DatasetReader<UserProfileModel2> profileReader = userProfileDataset.newReader();
  try {
    for (UserProfileModel2 profile : profileReader) {
      System.out.println(profile.toString());
    }
  } finally {
    // the reader must always be released
    profileReader.close();
  }
}
 
Example #28
Source File: DaoViewTest.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a view bounded on both name fields (after "1", up to and including
 * "9"), checks {@code includes} for entities inside and outside the range,
 * then reads the view and expects parts "2" through "9" in order.
 */
@Test
public void testRange() {
  populateTestEntities(10);

  final AbstractRefinableView<TestEntity> bounded =
      new DaoView<TestEntity>(ds, TestEntity.class)
          .fromAfter(NAMES[0], "1").to(NAMES[0], "9")
          .fromAfter(NAMES[1], "1").to(NAMES[1], "9");

  // The bounds are strings, not ints, so ordering is lexicographic.
  // Entities expected inside the range:
  Assert.assertTrue(bounded.includes(newTestEntity("5", "5")));
  Assert.assertTrue(bounded.includes(newTestEntity("5", "55")));
  Assert.assertTrue(bounded.includes(newTestEntity("9", "89")));
  Assert.assertTrue(bounded.includes(newTestEntity("9", "9")));
  // Entities expected outside the range:
  Assert.assertFalse(bounded.includes(newTestEntity("1", "1")));
  Assert.assertFalse(bounded.includes(newTestEntity("1", "0")));
  Assert.assertFalse(bounded.includes(newTestEntity("1", "10")));
  Assert.assertFalse(bounded.includes(newTestEntity("9", "99")));

  final DatasetReader<TestEntity> boundedReader = bounded.newReader();
  int expectedPart = 2;
  try {
    for (TestEntity current : boundedReader) {
      final String expected = Integer.toString(expectedPart);
      Assert.assertEquals(expected, current.getPart1());
      Assert.assertEquals(Integer.toString(expectedPart), current.getPart2());
      expectedPart += 1;
    }
  } finally {
    boundedReader.close();
  }

  // the last entity read must have been number 9
  Assert.assertEquals(10, expectedPart);
}
 
Example #29
Source File: DaoViewTest.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the view and asserts that each entity's two parts equal a running
 * index, rendered as a string, that starts at {@code startIdx}.
 *
 * @param range the view to read
 * @param startIdx the index expected for the first entity
 * @param endIdx the value the index must have after the last entity
 */
private void validRange(View<TestEntity> range, int startIdx, int endIdx) {
  final DatasetReader<TestEntity> rangeReader = range.newReader();
  int position = startIdx;
  try {
    for (TestEntity current : rangeReader) {
      Assert.assertEquals(Integer.toString(position), current.getPart1());
      Assert.assertEquals(Integer.toString(position), current.getPart2());
      position += 1;
    }
  } finally {
    rangeReader.close();
  }
  Assert.assertEquals(endIdx, position);
}
 
Example #30
Source File: TestHBaseActionModifiable.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Opens a reader on the dataset and asserts whether it has at least one
 * record.
 *
 * @param shouldExist true to require a record, false to require none
 */
private void checkRecord(boolean shouldExist) {
  final DatasetReader<TestEntity> probe = ds.newReader();
  try {
    final boolean hasRecord = probe.hasNext();
    if (shouldExist) {
      assertTrue(hasRecord);
    } else {
      assertFalse(hasRecord);
    }
  } finally {
    probe.close();
  }
}