parquet.column.ColumnDescriptor Java Examples

The following examples show how to use parquet.column.ColumnDescriptor. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetReader.java    From paraflow with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares every column reader for the next batch of rows.
 *
 * <p>If the current row group has no rows left, {@code advanceToNextRowGroup()} is
 * consulted first; when it returns {@code false} no batch is prepared. The batch is
 * capped at {@code MAX_VECTOR_LENGTH} rows and at the rows remaining in the group.
 *
 * @return the size of the prepared batch, or {@code -1} when
 *         {@code advanceToNextRowGroup()} reported {@code false}
 */
public int nextBatch()
{
    boolean groupExhausted = nextRowInGroup >= currentGroupRowCount;
    if (groupExhausted) {
        if (!advanceToNextRowGroup()) {
            return -1;
        }
    }

    batchSize = checkedCast(min(MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
    nextRowInGroup += batchSize;
    currentPosition += batchSize;

    for (PrimitiveColumnIO primitiveIO : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor plain = primitiveIO.getColumnDescriptor();
        // The reader map is keyed by RichColumnDescriptor, so rebuild the key from the plain descriptor.
        RichColumnDescriptor key = new RichColumnDescriptor(
                plain.getPath(),
                primitiveIO.getType().asPrimitiveType(),
                plain.getMaxRepetitionLevel(),
                plain.getMaxDefinitionLevel());
        columnReadersMap.get(key).prepareNextRead(batchSize);
    }
    return batchSize;
}
 
Example #2
Source File: ParquetReader.java    From paraflow with Apache License 2.0 6 votes vote down vote up
/**
 * Reads values of one primitive column through the reader registered for it in
 * {@code columnReadersMap}.
 *
 * <p>If that reader has no page reader yet, the column chunk described by the current
 * block metadata is read fully into a byte array and its pages are handed to the
 * reader before the values are read.
 *
 * @param columnDescriptor column to read; used as the key into {@code columnReadersMap}
 * @param type type passed through to the column reader
 * @param offsets offsets passed through to the column reader
 * @return the block produced by the column reader
 * @throws IOException declared by the chunk lookup and data source read performed here
 */
public Block readPrimitive(ColumnDescriptor columnDescriptor, Type type, IntList offsets)
        throws IOException
{
    ParquetColumnReader reader = columnReadersMap.get(columnDescriptor);
    if (reader.getPageReader() != null) {
        // Pages were already loaded by an earlier call.
        return reader.readPrimitive(type, offsets);
    }

    validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
    ColumnChunkMetaData chunkMetadata = getColumnChunkMetaData(columnDescriptor);
    long chunkStart = chunkMetadata.getStartingPos();
    int chunkLength = checkedCast(chunkMetadata.getTotalSize());

    // Load the complete column chunk into memory in one read.
    byte[] chunkBytes = new byte[chunkLength];
    dataSource.readFully(chunkStart, chunkBytes);

    ParquetColumnChunkDescriptor chunkDescriptor =
            new ParquetColumnChunkDescriptor(columnDescriptor, chunkMetadata, chunkLength);
    ParquetColumnChunk chunk = new ParquetColumnChunk(chunkDescriptor, chunkBytes, 0);
    reader.setPageReader(chunk.readAllPages());

    return reader.readPrimitive(type, offsets);
}
 
Example #3
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0 6 votes vote down vote up
/**
 * Prints a one-line description of a primitive field.
 *
 * <p>The line holds the field name prefixed with one dot per {@code depth} level, the
 * repetition, the primitive type name and, if set, the original type. When
 * {@code container} is non-null the maximum repetition and definition levels of the
 * column at {@code cpath} plus this field's name are appended.
 *
 * @param out writer receiving the line
 * @param type primitive field to describe
 * @param depth number of dots placed before the name
 * @param container schema used to look up the column description; may be {@code null}
 * @param cpath path of the enclosing fields; left unchanged on return
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  String label = Strings.repeat(".", depth) + type.getName();
  OriginalType originalType = type.getOriginalType();
  Repetition repetition = type.getRepetition();
  PrimitiveTypeName primitiveName = type.getPrimitiveTypeName();

  out.format("%s: %s %s", label, repetition, primitiveName);
  if (originalType != null) {
    out.format(" O:%s", originalType);
  }

  if (container == null) {
    out.println();
    return;
  }

  // Extend the shared path with this field only long enough to snapshot it.
  cpath.add(type.getName());
  String[] fullPath = cpath.toArray(new String[cpath.size()]);
  cpath.remove(cpath.size() - 1);

  ColumnDescriptor column = container.getColumnDescription(fullPath);
  int maxDefinition = column.getMaxDefinitionLevel();
  int maxRepetition = column.getMaxRepetitionLevel();
  out.format(" R:%d D:%d", maxRepetition, maxDefinition);
  out.println();
}
 
Example #4
Source File: ParquetReader.java    From paraflow with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the chunk metadata whose path equals the path of the given column among
 * the columns of {@code currentBlockMetadata}.
 *
 * @param columnDescriptor column whose chunk metadata is wanted
 * @return the first metadata entry with a matching path
 * @throws IOException as a {@code ParquetCorruptionException} when no entry matches
 */
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
        throws IOException
{
    for (ColumnChunkMetaData candidate : currentBlockMetadata.getColumns()) {
        ColumnPath candidatePath = candidate.getPath();
        boolean samePath = candidatePath.equals(ColumnPath.get(columnDescriptor.getPath()));
        if (samePath) {
            return candidate;
        }
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
 
Example #5
Source File: ParquetReader.java    From paraflow with Apache License 2.0 5 votes vote down vote up
/**
 * Registers one column reader in {@code columnReadersMap} for each primitive column
 * returned by {@code getColumns(fileSchema, requestedSchema)}, keyed by a
 * {@code RichColumnDescriptor} built from that column.
 */
private void initializeColumnReaders()
{
    for (PrimitiveColumnIO primitiveIO : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor plain = primitiveIO.getColumnDescriptor();
        RichColumnDescriptor key = new RichColumnDescriptor(
                plain.getPath(),
                primitiveIO.getType().asPrimitiveType(),
                plain.getMaxRepetitionLevel(),
                plain.getMaxDefinitionLevel());
        ParquetColumnReader reader = ParquetColumnReader.createReader(key);
        columnReadersMap.put(key, reader);
    }
}
 
Example #6
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0 5 votes vote down vote up
/**
 * Prints a one-line summary of a column descriptor: its dot-joined path, its type
 * and its maximum repetition and definition levels.
 *
 * @param out writer receiving the line
 * @param desc column descriptor to summarize
 */
public static void showDetails(PrettyPrintWriter out, ColumnDescriptor desc) {
  String dottedPath = Joiner.on(".").skipNulls().join(desc.getPath());
  PrimitiveTypeName primitiveName = desc.getType();
  int maxDefinition = desc.getMaxDefinitionLevel();
  int maxRepetition = desc.getMaxRepetitionLevel();

  // Repetition level is printed before definition level.
  out.format("column desc: %s T:%s R:%d D:%d%n",
      dottedPath, primitiveName, maxRepetition, maxDefinition);
}
 
Example #7
Source File: DumpCommand.java    From parquet-tools with Apache License 2.0 5 votes vote down vote up
/**
 * Prints a summary of one column of a row group, followed by one line per page.
 *
 * <p>The summary line holds the dot-joined column path, the total value count and the
 * maximum repetition and definition levels, plus dictionary size and encoding when a
 * dictionary page is present. Each page line lists the page's encodings, uncompressed
 * size and value count.
 *
 * @param out writer receiving the output
 * @param store page store that supplies the page reader for {@code column}
 * @param column column to dump
 * @throws IOException declared for the page-reading calls made here
 */
public static void dump(PrettyPrintWriter out, PageReadStore store, ColumnDescriptor column) throws IOException {
    PageReader pages = store.getPageReader(column);

    long totalValues = pages.getTotalValueCount();
    int maxRepetition = column.getMaxRepetitionLevel();
    int maxDefinition = column.getMaxDefinitionLevel();
    String dottedPath = Joiner.on('.').skipNulls().join(column.getPath());
    out.format("%s TV=%d RL=%d DL=%d", dottedPath, totalValues, maxRepetition, maxDefinition);

    DictionaryPage dictionary = pages.readDictionaryPage();
    if (dictionary != null) {
        out.format(" DS:%d", dictionary.getDictionarySize());
        out.format(" DE:%s", dictionary.getEncoding());
    }

    out.println();
    out.rule('-');

    // Pages are numbered from zero in the order the reader returns them.
    long pageIndex = 0;
    Page current;
    while ((current = pages.readPage()) != null) {
        out.format("page %d:", pageIndex);
        out.format(" DLE:%s", current.getDlEncoding());
        out.format(" RLE:%s", current.getRlEncoding());
        out.format(" VLE:%s", current.getValueEncoding());
        out.format(" SZ:%d", current.getUncompressedSize());
        out.format(" VC:%d", current.getValueCount());
        out.println();
        pageIndex++;
    }
}
 
Example #8
Source File: DumpCommand.java    From parquet-tools with Apache License 2.0 5 votes vote down vote up
/**
 * Prints every value of one column of a row group, one line per value.
 *
 * <p>A header line gives the row group number and the range of value indexes. Each
 * value line shows the value's index, its repetition level, its definition level and
 * the value itself, or {@code <null>} when the definition level is below the column's
 * maximum.
 *
 * @param out writer receiving the output
 * @param crstore store that supplies the column reader for {@code column}
 * @param column column to dump
 * @param page number of the row group, printed in the header
 * @param total total number of row groups, printed in the header
 * @param offset index of the first value, added to the per-value counter
 * @throws IOException declared by the original signature
 */
public static void dump(PrettyPrintWriter out, ColumnReadStoreImpl crstore, ColumnDescriptor column, long page, long total, long offset) throws IOException {
    int dmax = column.getMaxDefinitionLevel();
    ColumnReader creader = crstore.getColumnReader(column);
    out.format("*** row group %d of %d, values %d to %d ***%n", page, total, offset, offset + creader.getTotalValueCount() - 1);

    for (long i = 0, e = creader.getTotalValueCount(); i < e; ++i) {
        // Fix: the repetition level was previously read with getCurrentDefinitionLevel(),
        // so the "R:" field always repeated the definition level.
        int rlvl = creader.getCurrentRepetitionLevel();
        int dlvl = creader.getCurrentDefinitionLevel();

        out.format("value %d: R:%d D:%d V:", offset+i, rlvl, dlvl);
        // Only a value at the maximum definition level is read and printed.
        if (dlvl == dmax) {
            switch (column.getType()) {
            case BINARY:  out.format("%s", binaryToString(creader.getBinary())); break;
            case BOOLEAN: out.format("%s", creader.getBoolean()); break;
            case DOUBLE:  out.format("%s", creader.getDouble()); break;
            case FLOAT:   out.format("%s", creader.getFloat()); break;
            case INT32:   out.format("%s", creader.getInteger()); break;
            case INT64:   out.format("%s", creader.getLong()); break;
            case INT96:   out.format("%s", binaryToBigInteger(creader.getBinary())); break;
            case FIXED_LEN_BYTE_ARRAY: out.format("%s", binaryToString(creader.getBinary())); break;
            }
        } else {
            out.format("<null>");
        }

        out.println();
        // Advance the reader to the next value.
        creader.consume();
    }
}
 
Example #9
Source File: ParquetReader.java    From paraflow with Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload that reads a primitive column with a fresh, empty offsets list.
 *
 * @param columnDescriptor column to read
 * @param type type passed through to the three-argument overload
 * @return the block returned by the three-argument overload
 * @throws IOException propagated from the three-argument overload
 */
public Block readPrimitive(ColumnDescriptor columnDescriptor, Type type)
        throws IOException
{
    IntList emptyOffsets = new IntArrayList();
    return readPrimitive(columnDescriptor, type, emptyOffsets);
}
 
Example #10
Source File: ParaflowPageSource.java    From paraflow with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a loader for one column.
 *
 * @param columnDescriptor descriptor of the column to load; stored as given, without a null check
 * @param type type of the column; must not be {@code null}
 * @throws NullPointerException if {@code type} is {@code null}
 */
public ParquetBlockLoader(ColumnDescriptor columnDescriptor, Type type)
{
    this.type = requireNonNull(type, "type is null");
    this.columnDescriptor = columnDescriptor;
}
 
Example #11
Source File: LocalParquetEvaluator.java    From rainbow with Apache License 2.0 4 votes vote down vote up
/**
 * Opens one reader per file restricted to the requested columns, then reads every row
 * group of every file while counting row groups and rows.
 *
 * <p>The elapsed time covers the read phase only; opening the readers is not timed.
 * The returned column list is built from the first file alone. Every opened reader is
 * closed before this method returns, including when it exits with an exception.
 *
 * @param fileStatuses files to read; entry {@code i} is paired with {@code metadatas[i]}
 * @param metadatas footer metadata of each file, in the same order as {@code fileStatuses}
 * @param columnNames names of the columns to read; lower-cased before the schema lookup
 * @param conf configuration handed to each reader
 * @return metrics holding the columns, reader count, row group count, row count and
 *         elapsed milliseconds of the read phase
 * @throws IOException if opening, reading or closing a reader fails
 */
public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
    boolean printColumns = true;
    List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
    List<Column> columns = new ArrayList<Column>();
    // Number of readers the read loop has taken over; it closes those itself.
    int handled = 0;
    try
    {
        for (int i = 0; i < fileStatuses.length; ++i)
        {
            FileStatus status = fileStatuses[i];
            ParquetMetadata metadata = metadatas[i];

            MessageType schema = metadata.getFileMetaData().getSchema();

            List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();

            for (String columnName : columnNames)
            {
                // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
                int fieldIndex = schema.getFieldIndex(columnName.toLowerCase(java.util.Locale.ROOT));
                // NOTE(review): a field index is used to index the column list; presumably
                // the two only line up for schemas without nested groups - confirm.
                ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);

                columnDescriptors.add(descriptor);

                if (printColumns)
                {
                    Column column = new Column();
                    column.setIndex(fieldIndex);
                    column.setName(schema.getFieldName(column.getIndex()));
                    column.setDescriptor(descriptor);
                    columns.add(column);
                }
            }
            // Columns are collected from the first file only.
            printColumns = false;

            readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
        }

        long time  = System.currentTimeMillis();
        long rowCount = 0;
        long rowGroupCount = 0;
        long readerCount = readers.size();
        for (ParquetFileReader reader : readers)
        {
            handled ++;
            try
            {
                PageReadStore pageReadStore;
                while ((pageReadStore = reader.readNextRowGroup()) != null)
                {
                    rowGroupCount ++;
                    rowCount += pageReadStore.getRowCount();
                }
            }
            finally
            {
                reader.close();
            }
        }
        return new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis()-time);
    }
    finally
    {
        // After a failure, close the readers the read loop never reached so they do not leak.
        // On normal completion this loop does nothing.
        for (int i = handled; i < readers.size(); ++i)
        {
            try
            {
                readers.get(i).close();
            }
            catch (IOException ignored)
            {
                // Best effort: the exception already propagating is the one the caller needs.
            }
        }
    }
}
 
Example #12
Source File: Column.java    From rainbow with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the column descriptor held by this object.
 *
 * @return the current value of the {@code descriptor} field
 */
public ColumnDescriptor getDescriptor()
{
    return this.descriptor;
}
 
Example #13
Source File: Column.java    From rainbow with Apache License 2.0 4 votes vote down vote up
/**
 * Replaces the column descriptor held by this object.
 *
 * @param descriptor new value of the {@code descriptor} field; stored as given
 */
public void setDescriptor(ColumnDescriptor descriptor)
{
    this.descriptor = descriptor;
}