Java Code Examples for parquet.column.ColumnDescriptor

The following examples show how to use parquet.column.ColumnDescriptor. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: paraflow   Source File: ParquetReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Sizes the next batch of rows and prepares every requested column reader for it.
 *
 * <p>If the current row group has no rows left, {@code advanceToNextRowGroup()} is
 * consulted first; a {@code false} answer ends the read.
 *
 * @return the size of the batch, or {@code -1} when {@code advanceToNextRowGroup()} reports
 *         that there is nothing further to read
 */
public int nextBatch()
{
    boolean groupExhausted = nextRowInGroup >= currentGroupRowCount;
    if (groupExhausted && !advanceToNextRowGroup()) {
        return -1;
    }

    // The batch is capped by MAX_VECTOR_LENGTH and by the rows remaining in the group.
    batchSize = checkedCast(min(MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
    nextRowInGroup += batchSize;
    currentPosition += batchSize;

    for (PrimitiveColumnIO primitiveColumn : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor plain = primitiveColumn.getColumnDescriptor();
        // Readers are keyed by the rich descriptor, so rebuild that key for the lookup.
        RichColumnDescriptor readerKey = new RichColumnDescriptor(
                plain.getPath(),
                primitiveColumn.getType().asPrimitiveType(),
                plain.getMaxRepetitionLevel(),
                plain.getMaxDefinitionLevel());
        columnReadersMap.get(readerKey).prepareNextRead(batchSize);
    }
    return batchSize;
}
 
Example 2
Source Project: paraflow   Source File: ParquetReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads values of the given column through its registered column reader.
 *
 * <p>If the reader has no page reader yet, the whole column chunk is read from
 * {@code dataSource} into memory and its pages are attached to the reader before reading.
 *
 * @param columnDescriptor key used to look up the column reader and the chunk metadata
 * @param type type passed through to the column reader
 * @param offsets offsets list passed through to the column reader
 * @return the block produced by the column reader
 * @throws IOException if the chunk cannot be read or validation of the row group fails
 */
public Block readPrimitive(ColumnDescriptor columnDescriptor, Type type, IntList offsets)
        throws IOException
{
    ParquetColumnReader reader = columnReadersMap.get(columnDescriptor);
    if (reader.getPageReader() != null) {
        // Pages are already attached; nothing to load.
        return reader.readPrimitive(type, offsets);
    }

    validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");

    ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(columnDescriptor);
    long chunkStart = chunkMetadata.getStartingPos();
    int chunkLength = checkedCast(chunkMetadata.getTotalSize());

    // Pull the complete column chunk into a single buffer.
    byte[] chunkBytes = new byte[chunkLength];
    dataSource.readFully(chunkStart, chunkBytes);

    ParquetColumnChunkDescriptor chunkDescriptor =
            new ParquetColumnChunkDescriptor(columnDescriptor, chunkMetadata, chunkLength);
    ParquetColumnChunk chunk = new ParquetColumnChunk(chunkDescriptor, chunkBytes, 0);
    reader.setPageReader(chunk.readAllPages());

    return reader.readPrimitive(type, offsets);
}
 
Example 3
Source Project: parquet-tools   Source File: MetadataUtils.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prints a one-line description of a primitive field: name (prefixed by one dot per
 * depth level), repetition, primitive type and, when set, the original type. If a
 * container schema is supplied, the maximum repetition and definition levels of the
 * corresponding column are appended.
 *
 * @param out writer receiving the output
 * @param type primitive field to describe
 * @param depth number of "." characters placed before the field name
 * @param container schema used to resolve the column; may be {@code null} to skip the levels
 * @param cpath path of the enclosing fields; temporarily extended with this field's name and
 *              restored before the column lookup
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  String label = Strings.repeat(".", depth) + type.getName();
  OriginalType originalType = type.getOriginalType();

  out.format("%s: %s %s", label, type.getRepetition(), type.getPrimitiveTypeName());
  if (originalType != null) {
    out.format(" O:%s", originalType);
  }

  if (container != null) {
    // Snapshot the full path including this field, then put cpath back as it was.
    cpath.add(type.getName());
    String[] fullPath = cpath.toArray(new String[cpath.size()]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor column = container.getColumnDescription(fullPath);
    out.format(" R:%d D:%d", column.getMaxRepetitionLevel(), column.getMaxDefinitionLevel());
  }
  out.println();
}
 
Example 4
Source Project: paraflow   Source File: ParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Finds the chunk metadata in the current block whose path equals the path of the given column.
 *
 * @param columnDescriptor column whose chunk metadata is wanted
 * @return the first metadata entry of {@code currentBlockMetadata} with a matching path
 * @throws IOException declared by the signature; a {@code ParquetCorruptionException} is
 *         thrown when no entry matches
 */
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
        throws IOException
{
    for (ColumnChunkMetaData candidate : currentBlockMetadata.getColumns()) {
        boolean samePath = candidate.getPath().equals(ColumnPath.get(columnDescriptor.getPath()));
        if (!samePath) {
            continue;
        }
        return candidate;
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
 
Example 5
Source Project: paraflow   Source File: ParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Registers a column reader in {@code columnReadersMap} for every column returned by
 * {@code getColumns(fileSchema, requestedSchema)}, keyed by its rich column descriptor.
 */
private void initializeColumnReaders()
{
    for (PrimitiveColumnIO primitiveColumn : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor plain = primitiveColumn.getColumnDescriptor();
        RichColumnDescriptor readerKey = new RichColumnDescriptor(
                plain.getPath(),
                primitiveColumn.getType().asPrimitiveType(),
                plain.getMaxRepetitionLevel(),
                plain.getMaxDefinitionLevel());
        ParquetColumnReader reader = ParquetColumnReader.createReader(readerKey);
        columnReadersMap.put(readerKey, reader);
    }
}
 
Example 6
Source Project: parquet-tools   Source File: MetadataUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints a single line describing a column: its dot-joined path (null segments skipped),
 * its type, and its maximum repetition and definition levels.
 *
 * @param out writer receiving the output
 * @param desc column to describe
 */
public static void showDetails(PrettyPrintWriter out, ColumnDescriptor desc) {
  String dottedPath = Joiner.on(".").skipNulls().join(desc.getPath());

  out.format("column desc: %s T:%s R:%d D:%d%n",
      dottedPath,
      desc.getType(),
      desc.getMaxRepetitionLevel(),
      desc.getMaxDefinitionLevel());
}
 
Example 7
Source Project: parquet-tools   Source File: DumpCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints page-level information for one column of a page store: a header line with the
 * column path, total value count and maximum levels (plus dictionary size and encoding
 * when a dictionary page exists), a rule, and then one line per page with its encodings,
 * uncompressed size and value count.
 *
 * @param out writer receiving the output
 * @param store page store the column's page reader is taken from
 * @param column column to dump
 * @throws IOException declared by the signature
 */
public static void dump(PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
    PageReader pages = store.getPageReader(column);

    long totalValues = pages.getTotalValueCount();
    int maxRepetition = column.getMaxRepetitionLevel();
    int maxDefinition = column.getMaxDefinitionLevel();
    String dottedPath = Joiner.on('.').skipNulls().join(column.getPath());
    out.format("%s TV=%d RL=%d DL=%d", dottedPath, totalValues, maxRepetition, maxDefinition);

    DictionaryPage dictionary = pages.readDictionaryPage();
    if (dictionary != null) {
        out.format(" DS:%d", dictionary.getDictionarySize());
        out.format(" DE:%s", dictionary.getEncoding());
    }

    out.println();
    out.rule('-');

    // readPage() returning null marks the end of the column's pages.
    long pageIndex = 0;
    for (Page current = pages.readPage(); current != null; current = pages.readPage()) {
        out.format("page %d:", pageIndex);
        out.format(" DLE:%s", current.getDlEncoding());
        out.format(" RLE:%s", current.getRlEncoding());
        out.format(" VLE:%s", current.getValueEncoding());
        out.format(" SZ:%d", current.getUncompressedSize());
        out.format(" VC:%d", current.getValueCount());
        out.println();
        pageIndex++;
    }
}
 
Example 8
Source Project: parquet-tools   Source File: DumpCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prints every value of one column, one line per value, preceded by a header line.
 *
 * <p>Each line shows the value's index, its repetition level, its definition level and the
 * value itself. A value whose definition level is below the column's maximum is printed
 * as {@code <null>}.
 *
 * @param out writer receiving the output
 * @param crstore store the column reader is taken from
 * @param column column to dump
 * @param page number printed as the current row group in the header
 * @param total number printed as the row group total in the header
 * @param offset index assigned to the first value; later values count up from it
 * @throws IOException declared by the signature
 */
public static void dump(PrettyPrintWriter out, ColumnReadStoreImpl crstore, ColumnDescriptor column, long page, long total, long offset) throws IOException {
    int dmax = column.getMaxDefinitionLevel();
    ColumnReader creader = crstore.getColumnReader(column);
    out.format("*** row group %d of %d, values %d to %d ***%n", page, total, offset, offset + creader.getTotalValueCount() - 1);

    for (long i = 0, e = creader.getTotalValueCount(); i < e; ++i) {
        // The R: field must come from the repetition level; it previously read the
        // definition level twice, so R: and D: always printed the same number.
        int rlvl = creader.getCurrentRepetitionLevel();
        int dlvl = creader.getCurrentDefinitionLevel();

        out.format("value %d: R:%d D:%d V:", offset+i, rlvl, dlvl);
        if (dlvl == dmax) {
            // Only a value defined at the maximum level has data to read.
            switch (column.getType()) {
            case BINARY:  out.format("%s", binaryToString(creader.getBinary())); break;
            case BOOLEAN: out.format("%s", creader.getBoolean()); break;
            case DOUBLE:  out.format("%s", creader.getDouble()); break;
            case FLOAT:   out.format("%s", creader.getFloat()); break;
            case INT32:   out.format("%s", creader.getInteger()); break;
            case INT64:   out.format("%s", creader.getLong()); break;
            case INT96:   out.format("%s", binaryToBigInteger(creader.getBinary())); break;
            case FIXED_LEN_BYTE_ARRAY: out.format("%s", binaryToString(creader.getBinary())); break;
            }
        } else {
            out.format("<null>");
        }

        out.println();
        // Advance the reader to the next value.
        creader.consume();
    }
}
 
Example 9
Source Project: paraflow   Source File: ParquetReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload that reads a primitive column with a freshly created, empty offsets list.
 *
 * @param columnDescriptor column to read
 * @param type type passed through to the three-argument overload
 * @return the block returned by the three-argument overload
 * @throws IOException if the three-argument overload fails
 */
public Block readPrimitive(ColumnDescriptor columnDescriptor, Type type)
        throws IOException
{
    IntArrayList emptyOffsets = new IntArrayList();
    return readPrimitive(columnDescriptor, type, emptyOffsets);
}
 
Example 10
Source Project: paraflow   Source File: ParaflowPageSource.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a loader for one column.
 *
 * @param columnDescriptor column this loader is bound to; stored as given, without a null check
 * @param type type of the column; must not be {@code null}
 * @throws NullPointerException if {@code type} is {@code null}
 */
public ParquetBlockLoader(ColumnDescriptor columnDescriptor, Type type)
{
    Type checkedType = requireNonNull(type, "type is null");
    this.type = checkedType;
    this.columnDescriptor = columnDescriptor;
}
 
Example 11
Source Project: rainbow   Source File: LocalParquetEvaluator.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Opens one {@code ParquetFileReader} per file for the requested columns, reads every row
 * group of every file, and reports the totals together with the elapsed time.
 *
 * <p>The elapsed time starts after all readers have been created and covers reading and
 * closing them. The returned column list is built from the first file only.
 *
 * <p>Readers that were opened are closed even when opening a later file or reading fails.
 *
 * @param fileStatuses files to read; entry {@code i} is paired with {@code metadatas[i]}
 * @param metadatas metadata of each file, supplying the schema and the blocks
 * @param columnNames names of the columns to read; lower-cased (locale-independently)
 *                    before being looked up in the schema
 * @param conf configuration passed to each reader
 * @return metrics holding the columns, reader count, row group count, row count and
 *         elapsed milliseconds
 * @throws IOException if creating, reading or closing a reader fails
 */
public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
    List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
    List<Column> columns = new ArrayList<Column>();
    // Count of readers closed on the normal path; the finally block closes the remainder.
    int closedReaders = 0;
    try
    {
        boolean printColumns = true;
        for (int i = 0; i < fileStatuses.length; ++i)
        {
            FileStatus status = fileStatuses[i];
            ParquetMetadata metadata = metadatas[i];

            MessageType schema = metadata.getFileMetaData().getSchema();
            // Fetch the schema's column list once per file instead of once per column name.
            List<ColumnDescriptor> schemaColumns = schema.getColumns();

            List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();

            for (String columnName : columnNames)
            {
                // Locale.ROOT keeps the lookup independent of the JVM's default locale.
                int fieldIndex = schema.getFieldIndex(columnName.toLowerCase(java.util.Locale.ROOT));
                ColumnDescriptor descriptor = schemaColumns.get(fieldIndex);

                columnDescriptors.add(descriptor);

                // Column info is collected while processing the first file only.
                if (printColumns)
                {
                    Column column = new Column();
                    column.setIndex(fieldIndex);
                    column.setName(schema.getFieldName(column.getIndex()));
                    column.setDescriptor(descriptor);
                    columns.add(column);
                }
            }
            printColumns = false;

            readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
        }

        long time  = System.currentTimeMillis();
        long rowCount = 0;
        long rowGroupCount = 0;
        long readerCount = readers.size();
        for (ParquetFileReader reader : readers)
        {
            PageReadStore pageReadStore;
            while ((pageReadStore = reader.readNextRowGroup()) != null)
            {
                rowGroupCount ++;
                rowCount += pageReadStore.getRowCount();
            }
            reader.close();
            closedReaders ++;
        }
        return new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis()-time);
    }
    finally
    {
        // On success every reader is already closed and this loop does nothing.
        // After a failure it releases the readers that are still open.
        for (int i = closedReaders; i < readers.size(); ++i)
        {
            try
            {
                readers.get(i).close();
            }
            catch (IOException ignored)
            {
                // Best-effort cleanup: the original failure is already propagating
                // and must not be replaced by a secondary close error.
            }
        }
    }
}
 
Example 12
Source Project: rainbow   Source File: Column.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the column descriptor currently held by this object.
 *
 * @return the stored descriptor
 */
public ColumnDescriptor getDescriptor()
{
    return this.descriptor;
}
 
Example 13
Source Project: rainbow   Source File: Column.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Replaces the column descriptor held by this object.
 *
 * @param descriptor the descriptor to store; stored as given, without validation
 */
public void setDescriptor(ColumnDescriptor descriptor)
{
    this.descriptor = descriptor;
}