Java Code Examples for org.apache.parquet.example.data.Group#getFieldRepetitionCount()

The following examples show how to use org.apache.parquet.example.data.Group#getFieldRepetitionCount(). Each example is taken from an open-source project; the project and source file are noted above each snippet, so you can look up the surrounding code and related API usage there.
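Before the project examples, here is a minimal, self-contained sketch of the typical pattern: call getFieldRepetitionCount() before reading a value, so that an absent optional field is handled with a default instead of throwing. This snippet is not taken from any of the projects below; the field name "name" and the helper class are illustrative.

import org.apache.parquet.example.data.Group;

public class GroupUtil {
    /** Returns the "name" field of the group, or the fallback when no value is present. */
    static String nameOrDefault(Group g, String fallback) {
        // getFieldRepetitionCount reports how many values the field holds in this record:
        // 0 for an absent optional field, 1 for a present one, n for a repeated field.
        return g.getFieldRepetitionCount("name") > 0
            ? g.getString("name", 0)
            : fallback;
    }
}
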
Example 1
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Prints the given group (one row of the Parquet file), recursing into nested groups.
 *
 * @param g The given group.
 */
private static void printGroup(Group g) {
    int fieldCnt = g.getType().getFieldCount();
    for (int field = 0; field < fieldCnt; field++) {
        int valCnt = g.getFieldRepetitionCount(field);

        Type fieldType = g.getType().getType(field);
        String fieldName = fieldType.getName();

        for (int idx = 0; idx < valCnt; idx++) {
            if (fieldType.isPrimitive())
                System.out.println(fieldName + " " + g.getValueToString(field, idx));
            else
                printGroup(g.getGroup(field, idx));
        }
    }
    System.out.println();
}
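
As a brief aside (not part of the example above), the following hedged sketch shows one way to obtain Group rows to feed into printGroup, using parquet-mr's GroupReadSupport; the file path is hypothetical.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class PrintParquetFile {
    public static void main(String[] args) throws Exception {
        // Hypothetical input path; point this at a real Parquet file.
        try (ParquetReader<Group> reader =
                 ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/example.parquet")).build()) {
            Group g;
            while ((g = reader.read()) != null) {
                printGroup(g); // the method shown in Example 1
            }
        }
    }
}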
 
Example 2
Source File: ParquetResolver.java    From pxf with Apache License 2.0
private OneField resolvePrimitive(Group group, int columnIndex, Type type, int level) {

    OneField field = new OneField();
    // get type converter based on the primitive type
    ParquetTypeConverter converter = ParquetTypeConverter.from(type.asPrimitiveType());

    // determine how many values for the primitive are present in the column
    int repetitionCount = group.getFieldRepetitionCount(columnIndex);

    // at the top level (top field), non-repeated primitives will convert to typed OneField
    if (level == 0 && type.getRepetition() != REPEATED) {
        field.type = converter.getDataType(type).getOID();
        field.val = repetitionCount == 0 ? null : converter.getValue(group, columnIndex, 0, type);
    } else if (type.getRepetition() == REPEATED) {
        // repeated primitive at any level will convert into JSON
        ArrayNode jsonArray = mapper.createArrayNode();
        for (int repeatIndex = 0; repeatIndex < repetitionCount; repeatIndex++) {
            converter.addValueToJsonArray(group, columnIndex, repeatIndex, type, jsonArray);
        }
        // but will become a string only at top level
        if (level == 0) {
            field.type = DataType.TEXT.getOID();
            try {
                field.val = mapper.writeValueAsString(jsonArray);
            } catch (Exception e) {
                throw new RuntimeException("Failed to serialize repeated parquet type " + type.asPrimitiveType().getName(), e);
            }
        } else {
            // just return the array node within OneField container
            field.val = jsonArray;
        }
    } else {
        // level > 0 and type != REPEATED -- primitive type as a member of complex group -- NOT YET SUPPORTED
        throw new UnsupportedOperationException("Parquet complex type support is not yet available.");
    }
    return field;
}
 
Example 3
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Reads the SVM coefficient vector from the Parquet group.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readSVMCoefficients(SimpleGroup g) {
    Vector coefficients;
    Group coeffGroup = g.getGroup(0, 0).getGroup(3, 0);

    final int amountOfCoefficients = coeffGroup.getFieldRepetitionCount(0);

    coefficients = new DenseVector(amountOfCoefficients);

    for (int j = 0; j < amountOfCoefficients; j++) {
        double coefficient = coeffGroup.getGroup(0, j).getDouble(0, 0);
        coefficients.set(j, coefficient);
    }
    return coefficients;
}
 
Example 4
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Reads the linear regression coefficient vector from the Parquet group.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readLinRegCoefficients(SimpleGroup g) {
    Vector coefficients;
    Group coeffGroup = g.getGroup(1, 0).getGroup(3, 0);

    final int amountOfCoefficients = coeffGroup.getFieldRepetitionCount(0);

    coefficients = new DenseVector(amountOfCoefficients);

    for (int j = 0; j < amountOfCoefficients; j++) {
        double coefficient = coeffGroup.getGroup(0, j).getDouble(0, 0);
        coefficients.set(j, coefficient);
    }
    return coefficients;
}
 
Example 5
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0
private static List<PhoneNumber> getPhoneNumbers(Group phoneNumbers) {
  if (phoneNumbers == null) {
    return null;
  }
  List<PhoneNumber> list = new ArrayList<>();
  for (int i = 0, n = phoneNumbers.getFieldRepetitionCount("phone"); i < n; ++i) {
    Group phone = phoneNumbers.getGroup("phone", i);
    list.add(new PhoneNumber(getLong(phone, "number"), getString(phone, "kind")));
  }
  return list;
}
 
Example 6
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0
private static boolean isNull(Group group, String field) {
  // Use null value if the field is not in the group schema
  if (!group.getType().containsField(field)) {
    return true;
  }
  int repetition = group.getFieldRepetitionCount(field);
  if (repetition == 0) {
    return true;
  } else if (repetition == 1) {
    return false;
  }
  throw new AssertionError("Invalid repetitionCount " + repetition + " for field " + field + " in group " + group);
}
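
For context, the getLong and getString helpers used in Example 5 can be built on top of this isNull check. The sketch below illustrates the idea; it assumes the isNull method from Example 6 is in scope and is not necessarily the exact code in PhoneBookWriter.

private static Long getLong(Group group, String field) {
    // treat a field missing from the schema, or with a repetition count of 0, as null
    return isNull(group, field) ? null : group.getLong(field, 0);
}

private static String getString(Group group, String field) {
    return isNull(group, field) ? null : group.getString(field, 0);
}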
 
Example 7
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Loads a K-Means model.
 *
 * @param pathToMdl Path to the model.
 * @param learningEnvironment Learning environment used for logging.
 */
private static Model loadKMeansModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector[] centers = null;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int)pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);

                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);

                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

                centers[i] = new DenseVector(amountOfCoefficients);

                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new KMeansModel(centers, new EuclideanDistance());
}