Java Code Examples for org.pentaho.di.core.RowMetaAndData#setRowMeta()

The following examples show how to use org.pentaho.di.core.RowMetaAndData#setRowMeta(). You can go to the original project or source file by following the links above each example, and you can check out related API usage in the sidebar.
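Before the project examples, here is a minimal sketch of the usual setRowMeta() pattern (illustrative only, not taken from any of the projects below; the field names and values are made up): build a RowMeta describing the columns, attach it to the RowMetaAndData with setRowMeta(), then supply a data array that lines up with the value metas.

import org.pentaho.di.core.RowMetaAndData;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.value.ValueMetaString;

RowMetaAndData row = new RowMetaAndData();
RowMeta rowMeta = new RowMeta();
rowMeta.addValueMeta( new ValueMetaString( "firstName" ) ); // column 0
rowMeta.addValueMeta( new ValueMetaString( "phone" ) );     // column 1
row.setRowMeta( rowMeta );                                  // associate the metadata with the row
row.setData( new Object[] { "John", "4074549921" } );       // data array must match the value metas in order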
Example 1
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testAvroNestedReadLocalFileSystem() throws Exception {
  List<String> expectedRows = Arrays.asList( "John;4074549921", "Leslie;4079302194" );
  PentahoAvroInputFormat avroInputFormat = new PentahoAvroInputFormat( mock( NamedCluster.class ) );

  avroInputFormat.setInputSchemaFile( getFilePath( "/sample-schema.avro" ) );
  avroInputFormat.setInputFile( getFilePath( "/sample-data.avro" ) );
  avroInputFormat.setUseFieldAsInputStream( false );
  avroInputFormat.setIsDataBinaryEncoded( true );
  List<AvroInputField> inputFields = new ArrayList<>();

  addStringField( inputFields, "FirstName" );
  addStringField( inputFields, "Phone" );
  avroInputFormat.setInputFields( inputFields );

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "FirstName" ) );
  rowMeta.addValueMeta( new ValueMetaString( "Phone" ) );
  row.setRowMeta( rowMeta );

  avroInputFormat.setOutputRowMeta( rowMeta );

  IPentahoRecordReader recordReader = avroInputFormat.createRecordReader( null );
  assertEquals( expectedRows, generateDataSample( recordReader, inputFields ) );
}
 
Example 2
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testAvroArrayAndMapComplexTypes() throws Exception {

  List<String> expectedRows = Arrays.asList( "string1;string2;string4", "string101;string102;string104" );

  PentahoAvroInputFormat avroInputFormat = new PentahoAvroInputFormat( mock( NamedCluster.class ) );
  avroInputFormat.setInputFile( getFilePath( "/sampledata1.avro" ) );
  avroInputFormat.setUseFieldAsInputStream( false );
  avroInputFormat.setIsDataBinaryEncoded( true );
  List<AvroInputField> inputFields = new ArrayList<>();

  addStringField( inputFields, "parentString" );
  addStringField( inputFields, "parentStringMap[key1]" );
  addStringField( inputFields, "parentStringArray[0]" );
  avroInputFormat.setInputFields( inputFields );

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "parentString" ) );
  rowMeta.addValueMeta( new ValueMetaString( "parentStringMap[key1]" ) );
  rowMeta.addValueMeta( new ValueMetaString( "parentStringArray[0]" ) );
  row.setRowMeta( rowMeta );

  avroInputFormat.setOutputRowMeta( rowMeta );

  IPentahoRecordReader recordReader = avroInputFormat.createRecordReader( null );
  assertEquals( expectedRows, generateDataSample( recordReader, inputFields ) );
}
 
Example 3
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testAvroDatumReadFromField() throws Exception {
  List<String> expectedRows = singletonList( "1;string1;string6" );

  PentahoAvroInputFormat avroInputFormat = new PentahoAvroInputFormat( mock( NamedCluster.class ) );
  avroInputFormat.setDatum( true );
  avroInputFormat.setUseFieldAsInputStream( true );
  avroInputFormat.setUseFieldAsSchema( true );
  avroInputFormat.setInputStreamFieldName( "data" );
  avroInputFormat.setSchemaFieldName( "schema" );
  avroInputFormat.setIncomingFields( new Object[] { jsonDatumData, jsonSchema } );
  avroInputFormat.setIsDataBinaryEncoded( false );
  List<AvroInputField> inputFields = new ArrayList<>();

  addStringField( inputFields, "parentInt" );
  addStringField( inputFields, "parentString" );
  addStringField( inputFields, "childData.childString" );

  avroInputFormat.setInputFields( inputFields );

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "parentInt" ) );
  rowMeta.addValueMeta( new ValueMetaString( "parentString" ) );
  rowMeta.addValueMeta( new ValueMetaString( "childData.childString" ) );
  row.setRowMeta( rowMeta );
  avroInputFormat.setOutputRowMeta( rowMeta );

  RowMeta inRowMeta = new RowMeta();
  inRowMeta.addValueMeta( new ValueMetaString( "data" ) );
  inRowMeta.addValueMeta( new ValueMetaString( "schema" ) );
  avroInputFormat.setIncomingRowMeta( inRowMeta );

  IPentahoRecordReader recordReader = avroInputFormat.createRecordReader( null );

  assertEquals( expectedRows, generateDataSample( recordReader, inputFields ) );
}
 
Example 4
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testAvroJsonDatumReadFromFile() throws Exception {
  String tempDir = Files.createTempDirectory( "avro" ).toString();
  String datumFile = tempDir + File.separator + "datum";
  FileUtils.writeStringToFile( new File( datumFile ), jsonDatumData, "utf8" );

  List<String> expectedRows = singletonList( "1;string1;string6" );

  PentahoAvroInputFormat avroInputFormat = new PentahoAvroInputFormat( mock( NamedCluster.class ) );
  avroInputFormat.setDatum( true );
  avroInputFormat.setUseFieldAsInputStream( false );
  avroInputFormat.setUseFieldAsSchema( true );
  avroInputFormat.setInputFile( datumFile );
  avroInputFormat.setSchemaFieldName( "schema" );
  avroInputFormat.setIncomingFields( new Object[] { jsonSchema } );
  avroInputFormat.setIsDataBinaryEncoded( false );
  List<AvroInputField> inputFields = new ArrayList<>();

  addStringField( inputFields, "parentInt" );
  addStringField( inputFields, "parentString" );
  addStringField( inputFields, "childData.childString" );

  avroInputFormat.setInputFields( inputFields );

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "parentInt" ) );
  rowMeta.addValueMeta( new ValueMetaString( "parentString" ) );
  rowMeta.addValueMeta( new ValueMetaString( "childData.childString" ) );
  row.setRowMeta( rowMeta );
  avroInputFormat.setOutputRowMeta( rowMeta );
  RowMeta inRowMeta = new RowMeta();
  inRowMeta.addValueMeta( new ValueMetaString( "schema" ) );
  avroInputFormat.setIncomingRowMeta( inRowMeta );

  IPentahoRecordReader recordReader = avroInputFormat.createRecordReader( null );

  assertEquals( expectedRows, generateDataSample( recordReader, inputFields ) );

}
 
Example 5
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testAvroBinaryDatumReadFromFile() throws Exception {
  List<String> expectedRows = singletonList( "1;aString" );

  PentahoAvroInputFormat avroInputFormat = new PentahoAvroInputFormat( mock( NamedCluster.class ) );
  avroInputFormat.setDatum( true );
  avroInputFormat.setUseFieldAsInputStream( false );
  avroInputFormat.setUseFieldAsSchema( false );
  avroInputFormat.setInputFile( getFilePath( "/avro/flatschema.datum" ) );
  avroInputFormat.setInputSchemaFile( getFilePath( "/avro/flatschema.avsc" ) );
  avroInputFormat.setIsDataBinaryEncoded( true );
  avroInputFormat.setIncomingFields( new Object[] {} );

  List<AvroInputField> inputFields = new ArrayList<>();
  addStringField( inputFields, "parentInt" );
  addStringField( inputFields, "parentString" );

  avroInputFormat.setInputFields( inputFields );

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "parentInt" ) );
  rowMeta.addValueMeta( new ValueMetaString( "parentString" ) );
  row.setRowMeta( rowMeta );
  avroInputFormat.setOutputRowMeta( rowMeta );

  IPentahoRecordReader recordReader = avroInputFormat.createRecordReader( null );

  assertEquals( expectedRows, generateDataSample( recordReader, inputFields ) );

}
 
Example 6
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testAvroBinaryDatumReadFromField() throws Exception {
  List<String> expectedRows = singletonList( "1;aString" );
  byte[] datumBytes = Files.readAllBytes( new File( getFilePath( "/avro/flatschema.datum" ) ).toPath() );

  PentahoAvroInputFormat avroInputFormat = new PentahoAvroInputFormat( mock( NamedCluster.class ) );
  avroInputFormat.setDatum( true );
  avroInputFormat.setUseFieldAsInputStream( true );
  avroInputFormat.setInputStreamFieldName( "binaryData" );
  avroInputFormat.setUseFieldAsSchema( false );

  avroInputFormat.setInputSchemaFile( getFilePath( "/avro/flatschema.avsc" ) );

  avroInputFormat.setIsDataBinaryEncoded( true );
  avroInputFormat.setIncomingFields( new Object[] { datumBytes } );

  List<AvroInputField> inputFields = new ArrayList<>();
  addStringField( inputFields, "parentInt" );
  addStringField( inputFields, "parentString" );

  avroInputFormat.setInputFields( inputFields );

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "parentInt" ) );
  rowMeta.addValueMeta( new ValueMetaString( "parentString" ) );
  row.setRowMeta( rowMeta );
  avroInputFormat.setOutputRowMeta( rowMeta );
  RowMeta inRowMeta = new RowMeta();
  inRowMeta.addValueMeta( new ValueMetaBinary( "binaryData" ) );

  avroInputFormat.setIncomingRowMeta( inRowMeta );
  IPentahoRecordReader recordReader = avroInputFormat.createRecordReader( null );

  assertEquals( expectedRows, generateDataSample( recordReader, inputFields ) );

}
 
Example 7
Source File: MetaInjectTest.java    From pentaho-kettle with Apache License 2.0
private static RowMetaAndData createRowMetaAndData( ValueMetaInterface valueMeta, Object data ) {
  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( valueMeta );
  row.setRowMeta( rowMeta );
  row.setData( new Object[] { data } );
  return row;
}
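A hypothetical call to this helper (not part of the MetaInjectTest code shown above; the field name and value are made up) could look like this. Pairing a single ValueMetaInterface with a single data value keeps the metadata and the data array aligned by construction:

RowMetaAndData constantRow = createRowMetaAndData( new ValueMetaString( "country" ), "Belgium" );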
 
Example 8
Source File: CommonFormatShimTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testParquetWriteSuccessLocalFileSystem() throws Exception {
  final String PARQUET_FILE_NAME = "test.parquet";

  String tempFile = Files.createTempDirectory( "parquet" ).toUri().toString();

  ConfigurationProxy jobConfiguration = new ConfigurationProxy();
  jobConfiguration.set( FileOutputFormat.OUTDIR, tempFile );

  String parquetFilePath = jobConfiguration.get( FileOutputFormat.OUTDIR ) + PARQUET_FILE_NAME;

  IPentahoParquetOutputFormat pentahoParquetOutputFormat = null;
  switch ( provider ) {
    case "APACHE":
      pentahoParquetOutputFormat = new PentahoApacheOutputFormat();
      break;
    case "TWITTER":
      pentahoParquetOutputFormat = new PentahoTwitterOutputFormat();
      break;
    default:
      org.junit.Assert.fail( "Invalid provider name used." );
  }

  pentahoParquetOutputFormat.setOutputFile( parquetFilePath, true );

  pentahoParquetOutputFormat.setFields( ParquetUtils.createOutputFields( ParquetSpec.DataType.INT_64 ) );

  IPentahoRecordWriter recordWriter = pentahoParquetOutputFormat.createRecordWriter();
  RowMetaAndData rowInput = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "Name" ) );
  rowMeta.addValueMeta( new ValueMetaString( "Age" ) );
  rowInput.setRowMeta( rowMeta );

  rowInput.setData( new Object[] { "Andrey", "11" } );

  recordWriter.write( rowInput );
  recordWriter.close();

  IPentahoRecordReader recordReader = readCreatedParquetFile( parquetFilePath );

  Object[] rowInputArr =
    new Object[] { rowInput.getData()[ 0 ].toString(), Long.parseLong( rowInput.getData()[ 1 ].toString() ) };

  recordReader.forEach(
    rowMetaAndData -> org.junit.Assert.assertArrayEquals( rowMetaAndData.getData(), rowInputArr ) );
}
 
Example 9
Source File: PentahoParquetRecordWriterTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void recordWriterCreateFile() throws Exception {

  IPentahoOutputFormat.IPentahoRecordWriter writer = null;
  Object recordWriterObject = null;

  switch ( provider ) {
    case "APACHE":
      org.apache.parquet.hadoop.api.WriteSupport apacheSupport =
        new org.pentaho.hadoop.shim.common.format.parquet.delegate.apache.PentahoParquetWriteSupport(
          ParquetUtils.createOutputFields( ParquetSpec.DataType.INT_64 ) );
      org.apache.parquet.hadoop.ParquetOutputFormat apacheNativeParquetOutputFormat =
        new org.apache.parquet.hadoop.ParquetOutputFormat<>( apacheSupport );
      org.apache.parquet.hadoop.ParquetRecordWriter<RowMetaAndData> apacheRecordWriter =
        (org.apache.parquet.hadoop.ParquetRecordWriter<RowMetaAndData>) apacheNativeParquetOutputFormat
          .getRecordWriter( task );
      recordWriterObject = apacheRecordWriter;
      writer = new org.pentaho.hadoop.shim.common.format.parquet.delegate.apache.PentahoParquetRecordWriter(
        apacheRecordWriter, task );
      break;
    case "TWITTER":
      parquet.hadoop.api.WriteSupport twitterSupport =
        new org.pentaho.hadoop.shim.common.format.parquet.delegate.twitter.PentahoParquetWriteSupport(
          ParquetUtils.createOutputFields( ParquetSpec.DataType.INT_64 ) );
      parquet.hadoop.ParquetOutputFormat twitterNativeParquetOutputFormat =
        new parquet.hadoop.ParquetOutputFormat<>( twitterSupport );
      parquet.hadoop.ParquetRecordWriter<RowMetaAndData> twitterRecordWriter =
        (parquet.hadoop.ParquetRecordWriter<RowMetaAndData>) twitterNativeParquetOutputFormat.getRecordWriter( task );
      recordWriterObject = twitterRecordWriter;
      writer = new org.pentaho.hadoop.shim.common.format.parquet.delegate.twitter.PentahoParquetRecordWriter(
        twitterRecordWriter, task );
      break;
    default:
      org.junit.Assert.fail( "Invalid provider name used." );
  }

  RowMetaAndData row = new RowMetaAndData();
  RowMeta rowMeta = new RowMeta();
  rowMeta.addValueMeta( new ValueMetaString( "Name" ) );
  rowMeta.addValueMeta( new ValueMetaString( "Age" ) );
  row.setRowMeta( rowMeta );

  switch ( testType ) {
    case "DATA":
      row.setData( new Object[] { "Alex", "87" } );
      break;
    case "NULL":
      row.setData( new Object[] { null, null } );
      break;
    default:
      org.junit.Assert.fail( "Invalid test type used." );
  }

  writer.write( row );

  switch ( provider ) {
    case "APACHE":
      ( (org.apache.parquet.hadoop.ParquetRecordWriter<RowMetaAndData>) recordWriterObject ).close( task );
      break;
    case "TWITTER":
      ( (parquet.hadoop.ParquetRecordWriter<RowMetaAndData>) recordWriterObject ).close( task );
      break;
    default:
      org.junit.Assert.fail( "Invalid provider name used." );
  }

  Files.walk( Paths.get( tempFile.toString() ) )
    .filter( Files::isRegularFile )
    .forEach( ( f ) -> {
      String file = f.toString();
      if ( file.endsWith( "parquet" ) ) {
        try {
          switch ( testType ) {
            case "DATA":
              IPentahoInputFormat.IPentahoRecordReader recordReader =
                readCreatedParquetFile( Paths.get( file ).toUri().toString() );
              recordReader.forEach(
                rowMetaAndData -> Assert.assertTrue( rowMetaAndData.size() == 2 ) );
              break;
            case "NULL":
              Assert.assertTrue( Files.size( Paths.get( file ) ) > 0 );
              break;
            default:
              org.junit.Assert.fail( "Invalid test type used." );
          }
        } catch ( Exception e ) {
          e.printStackTrace();
        }
      }
    } );
}