org.apache.parquet.example.data.simple.SimpleGroup Java Examples

The following examples show how to use org.apache.parquet.example.data.simple.SimpleGroup. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
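As a quick orientation before the project examples, here is a minimal sketch of the typical SimpleGroup round trip: bind a group to a MessageType, write it with ExampleParquetWriter, and read it back, addressing values by (field index, repetition index). The schema, field names, and file path are illustrative assumptions, not code from any project below.

// Minimal sketch (assumed schema/path): write one record with SimpleGroup, then read it back.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SimpleGroupRoundTrip {
  public static void main(String[] args) throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int64 DocId; required binary Name (UTF8); }");
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);
    Path file = new Path("/tmp/simple-group-example.parquet"); // hypothetical location

    // Write: a SimpleGroup is bound to its schema; add() appends a value to the named field.
    try (ParquetWriter<Group> writer =
             ExampleParquetWriter.builder(file).withConf(conf).build()) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1L);
      g.add("Name", "foo");
      writer.write(g);
    }

    // Read: values are addressed by (field index, repetition index).
    try (ParquetReader<Group> reader =
             ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
      Group g;
      while ((g = reader.read()) != null) {
        System.out.println(g.getLong(0, 0) + ", " + g.getString(1, 0));
      }
    }
  }
}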
Example #1
Source File: ColumnSizeCommandTest.java    From parquet-mr with Apache License 2.0
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random rnd = new Random();
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", rnd.nextLong());
      g.add("Num", rnd.nextInt());
      writer.write(g);
    }
  }

  return file;
}
 
Example #2
Source File: TestColumnSizeCommand.java    From parquet-mr with Apache License 2.0
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random rnd = new Random();
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", rnd.nextLong());
      g.add("Num", rnd.nextInt());
      writer.write(g);
    }
  }

  return file;
}
 
Example #3
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
@Setup
public void writeFile() throws IOException {
  WriteConfigurator writeConfigurator = getWriteConfigurator();
  file = new Path(
      Files.createTempFile("benchmark-filtering_" + characteristic + '_' + writeConfigurator + '_', ".parquet")
          .toAbsolutePath().toString());
  long[] data = generateData();
  characteristic.arrangeData(data);
  try (ParquetWriter<Group> writer = writeConfigurator.configureBuilder(ExampleParquetWriter.builder(file)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .withRowGroupSize(Integer.MAX_VALUE) // ensure only one row group per file
      .withWriteMode(OVERWRITE))
      .build()) {
    for (long value : data) {
      Group group = new SimpleGroup(SCHEMA);
      group.add(0, value);
      group.add(1, Binary.fromString(dummyGenerator.nextString()));
      group.add(2, Binary.fromString(dummyGenerator.nextString()));
      group.add(3, Binary.fromString(dummyGenerator.nextString()));
      group.add(4, Binary.fromString(dummyGenerator.nextString()));
      group.add(5, Binary.fromString(dummyGenerator.nextString()));
      writer.write(group);
    }
  }
}
 
Example #4
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
@Override
public boolean next(Text key, Text value) throws IOException {

    if (eof)    // No records at all (the first record is read in the constructor)
        return false;

    try {
        // Remember that we've already read the first record
        if (!firstRecord) {
            if (!realReader.nextKeyValue())
                return false;   // eof

            SimpleGroup g = realReader.getCurrentValue();
            ls = groupToStrings(g);
        }
        else
            firstRecord = false;

        if (key != null) key.set(fetchKey());
        if (value != null) value.set(fetchValue());
        return true;
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}
 
Example #5
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
@Test
public void testArtSchema() throws ExecException, ParserException {

  String pigSchemaString =
          "DocId:long, " +
          "Links:(Backward:{(long)}, Forward:{(long)}), " +
          "Name:{(Language:{(Code:chararray,Country:chararray)}, Url:chararray)}";

  SimpleGroup g = new SimpleGroup(getMessageType(pigSchemaString));
  g.add("DocId", 1l);
  Group links = g.addGroup("Links");
  links.addGroup("Backward").addGroup("bag").add(0, 1l);
  links.addGroup("Forward").addGroup("bag").add(0, 1l);
  Group name = g.addGroup("Name").addGroup("bag");
  name.addGroup("Language").addGroup("bag").append("Code", "en").append("Country", "US");
  name.add("Url", "http://foo/bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(g));
}
 
Example #6
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Form the node data according to the data in a parquet row.
 *
 * @param g The given group representing the node data from the Spark DT model.
 */
@NotNull private static SparkModelParser.NodeData extractNodeDataFromParquetRow(SimpleGroup g) {
    NodeData nodeData = new NodeData();

    nodeData.id = g.getInteger(0, 0);
    nodeData.prediction = g.getDouble(1, 0);
    nodeData.leftChildId = g.getInteger(5, 0);
    nodeData.rightChildId = g.getInteger(6, 0);

    if (nodeData.leftChildId == -1 && nodeData.rightChildId == -1) {
        nodeData.featureIdx = -1;
        nodeData.threshold = -1;
        nodeData.isLeafNode = true;
    }
    else {
        final SimpleGroup splitGrp = (SimpleGroup)g.getGroup(7, 0);
        nodeData.featureIdx = splitGrp.getInteger(0, 0);
        nodeData.threshold = splitGrp.getGroup(1, 0).getGroup(0, 0).getDouble(0, 0);
    }
    return nodeData;
}
 
Example #7
Source File: ParquetFileLineFetcher.java    From hugegraph-loader with Apache License 2.0
@Override
public Line fetch() {
    boolean needFetchNext = this.pages == null ||
                            this.currRowOffset >= this.pagesRowCount;
    // Read next row group
    if (needFetchNext && !this.fetchNextPage()) {
        return null;
    }

    int fieldSize = this.schema.getFields().size();
    Object[] values = new Object[fieldSize];
    SimpleGroup group = (SimpleGroup) this.recordReader.read();
    for (int fieldIndex = 0; fieldIndex < fieldSize; fieldIndex++) {
        values[fieldIndex] = group.getValueToString(fieldIndex, 0);
    }
    String rawLine = StringUtils.join(values, Constants.COMMA_STR);

    this.currRowOffset++;
    this.increaseOffset();
    /*
     * NOTE: a parquet file corresponds to a table structure, so there is
     * no need to skip lines or match a header
     */
    return new Line(rawLine, this.source().header(), values);
}
 
Example #8
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
@Test
public void testBags() throws ExecException, ParserException {
  String pigSchemaString = "a: {(b: chararray)}";

  SimpleGroup g = new SimpleGroup(getMessageType(pigSchemaString));
  Group addGroup = g.addGroup("a");
  addGroup.addGroup("bag").append("b", "foo");
  addGroup.addGroup("bag").append("b", "bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(g));
}
 
Example #9
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
@Test
public void testMaps() throws ExecException, ParserException {
      String pigSchemaString = "a: [(b: chararray)]";
  SimpleGroup g = new SimpleGroup(getMessageType(pigSchemaString));
  Group map = g.addGroup("a");
  map.addGroup("map").append("key", "foo").addGroup("value").append("b", "foo");
  map.addGroup("map").append("key", "bar").addGroup("value").append("b", "bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(g));
}
 
Example #10
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Read coefficient matrix from parquet.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readCoefficients(SimpleGroup g) {
    Vector coefficients;
    final int amountOfCoefficients = g.getGroup(3, 0).getGroup(5, 0).getFieldRepetitionCount(0);

    coefficients = new DenseVector(amountOfCoefficients);

    for (int j = 0; j < amountOfCoefficients; j++) {
        double coefficient = g.getGroup(3, 0).getGroup(5, 0).getGroup(0, j).getDouble(0, 0);
        coefficients.set(j, coefficient);
    }
    return coefficients;
}
 
Example #11
Source File: ParquetAsJsonOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
@Override
protected RecordWriter<Text, Text>
    createRecordWriter(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf job, String name, Progressable p)
        throws IOException {

    return new JsonRecordWriterWrapper(w, fs, job, name, p);
}
 
Example #12
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void write(ParquetWriter<Group> writer) throws IOException {
  for (int index = 0; index < recordCount; index++) {
    Group group = new SimpleGroup(super.schema);

    for (int column = 0, columnCnt = schema.getFieldCount(); column < columnCnt; ++column) {
      Type type = schema.getType(column);
      RandomValueGenerator<?> generator = randomGenerators.get(column);
      if (type.isRepetition(OPTIONAL) && generator.shouldGenerateNull()) {
        continue;
      }
      switch (type.asPrimitiveType().getPrimitiveTypeName()) {
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
      case INT96:
        group.append(type.getName(), ((RandomBinaryBase<?>) generator).nextBinaryValue());
        break;
      case INT32:
        group.append(type.getName(), (Integer) generator.nextValue());
        break;
      case INT64:
        group.append(type.getName(), (Long) generator.nextValue());
        break;
      case FLOAT:
        group.append(type.getName(), (Float) generator.nextValue());
        break;
      case DOUBLE:
        group.append(type.getName(), (Double) generator.nextValue());
        break;
      case BOOLEAN:
        group.append(type.getName(), (Boolean) generator.nextValue());
        break;
      }
    }
    writer.write(group);
  }
}
 
Example #13
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0
public static SimpleGroup groupFromUser(User user) {
  SimpleGroup root = new SimpleGroup(schema);
  root.append("id", user.getId());

  if (user.getName() != null) {
    root.append("name", user.getName());
  }

  if (user.getPhoneNumbers() != null) {
    Group phoneNumbers = root.addGroup("phoneNumbers");
    for (PhoneNumber number : user.getPhoneNumbers()) {
      Group phone = phoneNumbers.addGroup("phone");
      phone.append("number", number.getNumber());
      if (number.getKind() != null) {
        phone.append("kind", number.getKind());
      }
    }
  }

  if (user.getLocation() != null) {
    Group location = root.addGroup("location");
    if (user.getLocation().getLon() != null) {
      location.append("lon", user.getLocation().getLon());
    }
    if (user.getLocation().getLat() != null) {
      location.append("lat", user.getLocation().getLat());
    }
  }
  return root;
}
 
Example #14
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  File testFile = temp.newFile();
  testFile.delete();

  writeSchema = "message example {\n" +
          "required binary content (UTF8);\n" +
          "}";

  Path path = new Path(testFile.toURI());

  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());

  Group r1 = new SimpleGroup(schema);
  writer.write(r1);
  writer.close();

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls are correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
 
Example #15
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", testDocs.docId[i]);
      g.add("Name", testDocs.name[i]);
      g.add("Gender", testDocs.gender[i]);
      Group links = g.addGroup("Links");
      links.add(0, testDocs.linkBackward[i]);
      links.add(1, testDocs.linkForward[i]);
      writer.write(g);
    }
  }

  return file;
}
 
Example #16
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1l);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      links.add(0, 2l);
      links.add(1, 3l);
      writer.write(g);
    }
  }

  return file;
}
 
Example #17
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public TextRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat,
                           InputSplit oldSplit,
                           JobConf oldJobConf,
                           Reporter reporter) throws IOException {

    splitLen = oldSplit.getLength();

    try {
        ReadSupport<SimpleGroup> rs = ParquetInputFormat.getReadSupportInstance(oldJobConf);
        realReader = new ParquetRecordReader<>(rs);
        realReader.initialize(((StreamingParquetInputSplitWrapper)oldSplit).realSplit, oldJobConf, reporter);

        oldJobConf.set("map.input.file",((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString());
        oldJobConf.set("mapreduce.map.input.file",((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString());

        // read once to gain access to key and value objects
        if (realReader.nextKeyValue()) {

          firstRecord = true;
          valueContainer = new Container<>();
          SimpleGroup v = realReader.getCurrentValue();
          valueContainer.set(v);
          ls = groupToStrings(v);
        } else {

          eof = true;
        }
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }
}
 
Example #18
Source File: TestConstants.java    From incubator-gobblin with Apache License 2.0
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group group = new SimpleGroup(PARQUET_SCHEMA);
  group.add(PAYLOAD_FIELD_NAME, record.getPayload());
  group.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()));
  group.add(PARTITION_FIELD_NAME, record.getPartition());
  return group;
}
 
Example #19
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Read interceptor value from parquet.
 *
 * @param g Interceptor group.
 */
private static double readInterceptor(SimpleGroup g) {
    double interceptor;

    final SimpleGroup interceptVector = (SimpleGroup)g.getGroup(2, 0);
    final SimpleGroup interceptVectorVal = (SimpleGroup)interceptVector.getGroup(3, 0);
    final SimpleGroup interceptVectorValElement = (SimpleGroup)interceptVectorVal.getGroup(0, 0);

    interceptor = interceptVectorValElement.getDouble(0, 0);

    return interceptor;
}
 
Example #20
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Read coefficient matrix from parquet.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readLinRegCoefficients(SimpleGroup g) {
    Vector coefficients;
    Group coeffGroup = g.getGroup(1, 0).getGroup(3, 0);

    final int amountOfCoefficients = coeffGroup.getFieldRepetitionCount(0);

    coefficients = new DenseVector(amountOfCoefficients);

    for (int j = 0; j < amountOfCoefficients; j++) {
        double coefficient = coeffGroup.getGroup(0, j).getDouble(0, 0);
        coefficients.set(j, coefficient);
    }
    return coefficients;
}
 
Example #21
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
protected List<String> groupToStrings(SimpleGroup grp) {

            ArrayList<String> s = new ArrayList<>();

            for (int n = 0; n < grp.getType().getFieldCount(); n ++) {

                Type field = grp.getType().getType(n);
                    try {
                        if (!field.isPrimitive())
                           s.addAll(groupToStrings((SimpleGroup) grp.getGroup(n, 0))); // array of groups not (yet) supported
                        else if (field.getRepetition() == Type.Repetition.REPEATED) {

                            boolean is_binary =
                                field.asPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY;
                            StringBuilder sb = new StringBuilder("[");
                            ArrayList<String> arr = new ArrayList<>();
                            for (int i = 0; i < grp.getFieldRepetitionCount(n); i ++)
                                arr.add(is_binary ? "\"" + grp.getValueToString(n, i) + "\"" :
                                    grp.getValueToString(n, i));

                            sb.append(Joiner.on(", ").join(arr));
                            sb.append("]");
                            s.add(sb.toString());
                        }
                        else
                            s.add(grp.getValueToString(n, 0));
                    }
                    catch (RuntimeException e) {
                        if(e.getMessage().startsWith("not found") && field.getRepetition() == Type.Repetition.OPTIONAL)
                            s.add("");
                        else
                            throw e;
                    }
            }

            return s;
        }
 
Example #22
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Read coefficient matrix from parquet.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readSVMCoefficients(SimpleGroup g) {
    Vector coefficients;
    Group coeffGroup = g.getGroup(0, 0).getGroup(3, 0);

    final int amountOfCoefficients = coeffGroup.getFieldRepetitionCount(0);

    coefficients = new DenseVector(amountOfCoefficients);

    for (int j = 0; j < amountOfCoefficients; j++) {
        double coefficient = coeffGroup.getGroup(0, j).getDouble(0, 0);
        coefficients.set(j, coefficient);
    }
    return coefficients;
}
 
Example #23
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader<Group> recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
 
Example #24
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader<Group> recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
 
Example #25
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinearSVMModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader<Group> recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}
 
Example #26
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader<Group> recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
 
Example #27
Source File: ParquetAsJsonInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
private void groupToJson(JsonGenerator currentGenerator, SimpleGroup grp)
      throws IOException {

    GroupType gt = grp.getType();

    currentGenerator.writeStartObject();
    for(int i = 0; i < gt.getFieldCount(); i ++) {

        String field = gt.getFieldName(i);
        try {
            Type t = gt.getType(i);
            int repetition = 1;
            boolean repeated = false;
            if (t.getRepetition() == Type.Repetition.REPEATED) {
                repeated = true;
                repetition = grp.getFieldRepetitionCount(i);
                currentGenerator.writeArrayFieldStart(field);
            }
            else
                currentGenerator.writeFieldName(field);

            for(int j = 0; j < repetition; j ++) {

                if (t.isPrimitive()) {
                    switch (t.asPrimitiveType().getPrimitiveTypeName()) {
                        case BINARY:
                            currentGenerator.writeString(grp.getString(i, j));
                            break;
                        case INT32:
                            currentGenerator.writeNumber(grp.getInteger(i, j));
                            break;
                        case INT96:
                        case INT64:
                            // clumsy way - TODO - Subclass SimpleGroup or something like that
                            currentGenerator.writeNumber(Long.parseLong(grp.getValueToString(i, j)));
                            break;
                        case DOUBLE:
                        case FLOAT:
                            currentGenerator.writeNumber(Double.parseDouble(grp.getValueToString(i, j)));
                            break;
                        case BOOLEAN:
                            currentGenerator.writeBoolean(grp.getBoolean(i, j));
                            break;
                        default:
                            throw new RuntimeException("Can't handle type " + gt.getType(i));
                    }
                } else {
                    groupToJson(currentGenerator, (SimpleGroup) grp.getGroup(i, j));
                }
            }

            if (repeated)
                currentGenerator.writeEndArray();
        }
        catch (Exception e) {
            if (e.getMessage().startsWith("not found") && gt.getType(i).getRepetition() == Type.Repetition.OPTIONAL)
                currentGenerator.writeNull();
            else
                throw new RuntimeException(e);
        }
    }
    currentGenerator.writeEndObject();
}
 
Example #28
Source File: ParquetAsJsonInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public JsonRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat, InputSplit oldSplit,
        JobConf oldJobConf, Reporter reporter) throws IOException {
    super(newInputFormat, oldSplit, oldJobConf, reporter);
}
 
Example #29
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@Test
public void testGetFields_Primitive_Repeated_Synthetic() {
    // this test does not read the actual Parquet file, but rather construct Group object synthetically
    schema = getParquetSchemaForPrimitiveTypes(Type.Repetition.REPEATED, true);
    // schema has changed, set metadata again
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    /*
     * Corresponding DB column types are:
     * TEXT, TEXT, INTEGER, DOUBLE PRECISION, NUMERIC, TIMESTAMP, REAL, BIGINT,
     * BOOLEAN, SMALLINT, SMALLINT, VARCHAR(5), CHAR(3), BYTEA
     */

    Group group = new SimpleGroup(schema);

    group.add(0, "row1-1");
    group.add(0, "row1-2");

    // leave column 1 (t2) unset as part of the test

    group.add(2, 1);
    group.add(2, 2);
    group.add(2, 3);

    group.add(3, 6.0d);
    group.add(3, -16.34d);

    BigDecimal value = new BigDecimal("12345678.9012345987654321"); // place of dot doesn't matter
    byte fillByte = (byte) (value.signum() < 0 ? 0xFF : 0x00);
    byte[] unscaled = value.unscaledValue().toByteArray();
    byte[] bytes = new byte[16];
    int offset = bytes.length - unscaled.length;
    for (int i = 0; i < bytes.length; i += 1) {
        bytes[i] = (i < offset) ? fillByte : unscaled[i - offset];
    }
    group.add(4, Binary.fromReusedByteArray(bytes));

    group.add(5, ParquetTypeConverter.getBinaryFromTimestamp("2019-03-14 14:10:28"));
    group.add(5, ParquetTypeConverter.getBinaryFromTimestamp("1969-12-30 05:42:23.211211"));

    group.add(6, 7.7f);
    group.add(6, -12345.35354646f);

    group.add(7, 23456789L);
    group.add(7, -123456789012345L);

    group.add(8, true);
    group.add(8, false);

    group.add(9, (short) 1);
    group.add(9, (short) -3);

    group.add(10, (short) 269);
    group.add(10, (short) -313);

    group.add(11, Binary.fromString("Hello"));
    group.add(11, Binary.fromString("World"));

    group.add(12, Binary.fromString("foo"));
    group.add(12, Binary.fromString("bar"));

    byte[] byteArray1 = new byte[]{(byte) 49, (byte) 50, (byte) 51};
    group.add(13, Binary.fromReusedByteArray(byteArray1, 0, 3));
    byte[] byteArray2 = new byte[]{(byte) 52, (byte) 53, (byte) 54};
    group.add(13, Binary.fromReusedByteArray(byteArray2, 0, 3));

    group.add(14, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28+07"));
    OffsetDateTime offsetDateTime1 = OffsetDateTime.parse("2019-03-14T14:10:28+07:00");
    ZonedDateTime localDateTime1 = offsetDateTime1.atZoneSameInstant(ZoneId.systemDefault());
    String localDateTimeString1 = localDateTime1.format(DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]"));

    group.add(15, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28-07:30"));
    OffsetDateTime offsetDateTime2 = OffsetDateTime.parse("2019-03-14T14:10:28-07:30");
    ZonedDateTime localDateTime2 = offsetDateTime2.atZoneSameInstant(ZoneId.systemDefault());
    String localDateTimeString2 = localDateTime2.format(DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]"));


    List<Group> groups = new ArrayList<>();
    groups.add(group);
    List<OneField> fields = assertRow(groups, 0, 16);

    assertField(fields, 0, "[\"row1-1\",\"row1-2\"]", DataType.TEXT);
    assertField(fields, 1, "[]", DataType.TEXT);
    assertField(fields, 2, "[1,2,3]", DataType.TEXT);
    assertField(fields, 3, "[6.0,-16.34]", DataType.TEXT);
    assertField(fields, 4, "[123456.789012345987654321]", DataType.TEXT); // scale fixed to 18 in schema
    assertField(fields, 5, "[\"2019-03-14 14:10:28\",\"1969-12-30 05:42:23.211211\"]", DataType.TEXT);
    assertField(fields, 6, "[7.7,-12345.354]", DataType.TEXT); // rounded to the precision of 8
    assertField(fields, 7, "[23456789,-123456789012345]", DataType.TEXT);
    assertField(fields, 8, "[true,false]", DataType.TEXT);
    assertField(fields, 9, "[1,-3]", DataType.TEXT);
    assertField(fields, 10, "[269,-313]", DataType.TEXT);
    assertField(fields, 11, "[\"Hello\",\"World\"]", DataType.TEXT);
    assertField(fields, 12, "[\"foo\",\"bar\"]", DataType.TEXT); // 3 chars only
    Base64.Encoder encoder = Base64.getEncoder(); // byte arrays are Base64 encoded into strings
    String expectedByteArrays = "[\"" + encoder.encodeToString(byteArray1) + "\",\"" + encoder.encodeToString(byteArray2) + "\"]";
    assertField(fields, 13, expectedByteArrays, DataType.TEXT);
    assertField(fields, 14, "[\"" + localDateTimeString1 + "\"]", DataType.TEXT);
    assertField(fields, 15, "[\"" + localDateTimeString2 + "\"]", DataType.TEXT);
}
 
Example #30
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
protected RecordWriter<Text, Text>
    createRecordWriter(ParquetRecordWriter<SimpleGroup> w, FileSystem fs, JobConf job, String name, Progressable p)
        throws IOException {

    return new TextRecordWriterWrapper(w, fs, job, name, p);
}