org.apache.avro.file.SeekableInput Java Examples
The following examples show how to use
org.apache.avro.file.SeekableInput.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroScanner.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Initializes the AvroScanner: sets up the projection, allocates the output
 * tuple and opens a {@link DataFileReader} over the fragment's path.
 *
 * @throws IOException propagated from schema lookup or from opening the file
 */
@Override
public void init() throws IOException {
  // No explicit targets: use the columns returned by schema.toArray().
  if (targets == null) {
    targets = schema.toArray();
  }
  prepareProjection(targets);
  outTuple = new VTuple(projectionMap.length);

  final Schema readSchema = AvroUtil.getAvroSchema(meta, conf);
  avroFields = readSchema.getFields();

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>(readSchema);
  final SeekableInput fragmentInput = new FsInput(fragment.getPath(), conf);
  dataFileReader = new DataFileReader<>(fragmentInput, recordReader);

  super.init();
}
Example #2
Source File: AvroInputFormat.java From stratosphere with Apache License 2.0 | 6 votes |
/**
 * Opens the given split and syncs an Avro file reader to the split's start offset.
 *
 * @param split the file split to read
 * @throws IOException propagated from the parent {@code open} or from the Avro reader
 */
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);

  // Generated (specific) record classes get a SpecificDatumReader; anything else is read reflectively.
  final boolean isSpecificRecord =
      org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType);
  final DatumReader<E> recordReader = isSpecificRecord
      ? new SpecificDatumReader<E>(avroValueType)
      : new ReflectDatumReader<E>(avroValueType);

  LOG.info("Opening split " + split);

  final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());
  dataFileReader = DataFileReader.openReader(seekable, recordReader);
  dataFileReader.sync(split.getStart());
}
Example #3
Source File: AvroInputFormat.java From stratosphere with Apache License 2.0 | 6 votes |
/**
 * Opens the given split, instantiates the Avro wrapper value and syncs an
 * Avro file reader to the split's start offset.
 *
 * @param split the file split to read
 * @throws IOException propagated from the parent {@code open} or from the Avro reader
 */
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);
  this.wrapper = InstantiationUtil.instantiate(avroWrapperTypeClass, AvroBaseValue.class);

  // Generated (specific) record classes get a SpecificDatumReader; anything else is read reflectively.
  final boolean isSpecificRecord =
      org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType);
  final DatumReader<E> recordReader = isSpecificRecord
      ? new SpecificDatumReader<E>(avroValueType)
      : new ReflectDatumReader<E>(avroValueType);

  LOG.info("Opening split " + split);

  final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());
  dataFileReader = DataFileReader.openReader(seekable, recordReader);
  dataFileReader.sync(split.getStart());

  // Start the split without a reusable value instance.
  reuseAvroValue = null;
}
Example #4
Source File: SchemaCommand.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Returns the Parquet schema of {@code source} rendered as a string.
 *
 * @param source location of the file to inspect
 * @return the string form of the file's Parquet schema
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if the detected format is not Parquet
 */
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        // Close the footer reader once the schema has been extracted;
        // the original left it open, leaking the underlying file handle.
        try (ParquetFileReader reader = new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)) {
          return reader.getFileMetaData().getSchema().toString();
        }
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
Example #5
Source File: BaseCommand.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Derives an Avro schema for {@code source} based on its detected format.
 *
 * @param source location of the file to inspect
 * @return the schema produced by the matching {@code Schemas} factory
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if no supported format/extension matches
 */
protected Schema getAvroSchema(String source) throws IOException {
  try (SeekableInput seekable = openSeekable(source)) {
    final Formats.Format detected = Formats.detectFormat((InputStream) seekable);
    seekable.seek(0);

    switch (detected) {
      case PARQUET:
        return Schemas.fromParquet(getConf(), qualifiedURI(source));
      case AVRO:
        return Schemas.fromAvro(open(source));
      case TEXT:
        // Text input is only understood when the file name ends in avsc or json.
        if (source.endsWith("avsc")) {
          return Schemas.fromAvsc(open(source));
        }
        if (source.endsWith("json")) {
          return Schemas.fromJSON("json", open(source));
        }
        break;
      default:
        break;
    }

    throw new IllegalArgumentException(String.format(
        "Could not determine file format of %s.", source));
  }
}
Example #6
Source File: AvroInputFormat.java From flink with Apache License 2.0 | 6 votes |
/**
 * Builds the Avro file reader for the given split and resets the per-split
 * bookkeeping fields ({@code end}, {@code recordsReadSinceLastSync}).
 *
 * @param split the split about to be read
 * @return a reader opened over the already-open {@code stream}
 * @throws IOException propagated from the file-status lookup or from Avro
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
  // Pick the datum reader flavour from the configured value type.
  final DatumReader<E> recordReader;
  if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
    recordReader = new GenericDatumReader<E>();
  } else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    recordReader = new SpecificDatumReader<E>(avroValueType);
  } else {
    recordReader = new ReflectDatumReader<E>(avroValueType);
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("Opening split {}", split);
  }

  // The wrapper is given the length of the whole file, taken from its file status.
  final long fileLength =
      split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
  final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
  final DataFileReader<E> avroReader =
      (DataFileReader<E>) DataFileReader.openReader(seekable, recordReader);

  if (LOG.isDebugEnabled()) {
    LOG.debug("Loaded SCHEMA: {}", avroReader.getSchema());
  }

  end = split.getStart() + split.getLength();
  recordsReadSinceLastSync = 0;
  return avroReader;
}
Example #7
Source File: AvroRecordWriterTest.java From data-highway with Apache License 2.0 | 6 votes |
/**
 * Writes a single record through the writer produced by {@code Factory} and
 * checks that reading the bytes back as an Avro data file yields exactly that record.
 */
@Test
public void typical() throws Exception {
  Schema recordSchema = SchemaBuilder
      .builder()
      .record("record")
      .fields()
      .requiredLong("id")
      .requiredString("name")
      .endRecord();
  Record expected = new GenericRecordBuilder(recordSchema)
      .set("id", 1L)
      .set("name", "hello")
      .build();

  // Write side: one record into an in-memory sink.
  ByteArrayOutputStream sink = new ByteArrayOutputStream();
  RecordWriter recordWriter = new Factory(CodecFactory.nullCodec()).create(recordSchema, sink);
  recordWriter.write(expected);
  recordWriter.close();

  // Read side: the written bytes must contain that record and nothing more.
  DataFileReader<Record> avroReader = new DataFileReader<>(
      new SeekableByteArrayInput(sink.toByteArray()),
      new GenericDatumReader<Record>(recordSchema));
  assertThat(avroReader.next(), is(expected));
  assertThat(avroReader.hasNext(), is(false));
  avroReader.close();
}
Example #8
Source File: Purge.java From Cubert with Apache License 2.0 | 6 votes |
/**
 * Opens an Avro data file reader for {@code filename}.
 *
 * @param filename path of the Avro file
 * @param localFS {@code true} to open the file via {@link File}; otherwise it
 *        is opened as a Hadoop {@code Path} using {@code conf}
 * @return a reader over the file; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened
 */
private DataFileReader<GenericRecord> createDataFileReader(String filename, boolean localFS)
    throws IOException {
  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();

  if (localFS) {
    return new DataFileReader<GenericRecord>(new File(filename), recordReader);
  }

  final SeekableInput hadoopInput = new FsInput(new Path(filename), conf);
  return new DataFileReader<GenericRecord>(hadoopInput, recordReader);
}
Example #9
Source File: AvroInputFormat.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
/**
 * Creates the Avro reader used for {@code split} and resets the split-local
 * state ({@code end} and {@code recordsReadSinceLastSync}).
 *
 * @param split the input split to open
 * @return the opened Avro data file reader
 * @throws IOException propagated from the file system or from Avro
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
  final DatumReader<E> valueReader;
  if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
    // Generic records need no target class.
    valueReader = new GenericDatumReader<E>();
  } else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    valueReader = new SpecificDatumReader<E>(avroValueType);
  } else {
    valueReader = new ReflectDatumReader<E>(avroValueType);
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("Opening split {}", split);
  }

  // Length handed to the wrapper is that of the entire file, not of the split.
  final long totalFileLength =
      split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
  final SeekableInput wrappedStream = new FSDataInputStreamWrapper(stream, totalFileLength);
  final DataFileReader<E> fileReader =
      (DataFileReader<E>) DataFileReader.openReader(wrappedStream, valueReader);

  if (LOG.isDebugEnabled()) {
    LOG.debug("Loaded SCHEMA: {}", fileReader.getSchema());
  }

  end = split.getStart() + split.getLength();
  recordsReadSinceLastSync = 0;
  return fileReader;
}
Example #10
Source File: AvroInputFormat.java From flink with Apache License 2.0 | 6 votes |
/**
 * Opens an Avro data file reader over the current {@code stream} for the
 * given split, and records where the split ends.
 *
 * @param split the split to be consumed
 * @return the freshly opened reader
 * @throws IOException propagated from the file-status lookup or from Avro
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
  final boolean wantsGeneric = org.apache.avro.generic.GenericRecord.class == avroValueType;

  final DatumReader<E> datumReader;
  if (wantsGeneric) {
    datumReader = new GenericDatumReader<E>();
  } else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
    datumReader = new SpecificDatumReader<E>(avroValueType);
  } else {
    datumReader = new ReflectDatumReader<E>(avroValueType);
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("Opening split {}", split);
  }

  // Whole-file length, looked up through the split path's file system.
  final long lengthOfFile =
      split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
  final SeekableInput input = new FSDataInputStreamWrapper(stream, lengthOfFile);
  final DataFileReader<E> opened =
      (DataFileReader<E>) DataFileReader.openReader(input, datumReader);

  if (LOG.isDebugEnabled()) {
    LOG.debug("Loaded SCHEMA: {}", opened.getSchema());
  }

  end = split.getStart() + split.getLength();
  recordsReadSinceLastSync = 0;
  return opened;
}
Example #11
Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0 | 6 votes |
/**
 * Reads every record from {@code avroInputFile} and hands it to
 * {@code addAvroRecord}, after initializing the writer for {@code orcOutputFile}
 * with the Avro file's schema.
 *
 * @param avroInputFile Avro data to read; closed together with the reader
 * @param orcOutputFile destination passed to {@code initializeWriter}
 * @throws IOException propagated from reading or writing
 */
public void convert(SeekableInput avroInputFile, Path orcOutputFile) throws IOException {
  final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (FileReader<GenericRecord> avroRecords =
           DataFileReader.openReader(avroInputFile, datumReader)) {
    initializeWriter(avroRecords.getSchema(), orcOutputFile);

    while (avroRecords.hasNext()) {
      addAvroRecord(avroRecords.next());
    }

    closeWriter();
  }
}
Example #12
Source File: AvroUtilsTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Reads every record of the Avro data file at {@code path} into a list.
 *
 * @param path location of the Avro data file
 * @return all records of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path) throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();

  // try-with-resources releases the input and the reader even when opening
  // or iterating fails; the original only closed on the success path.
  try (SeekableInput input = new FsInput(new Path(path), config);
       FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
Example #13
Source File: AvroRecordInputFormat.java From stratosphere with Apache License 2.0 | 5 votes |
/**
 * Opens the split and syncs a generic-record Avro reader to the split's start offset.
 *
 * @param split the file split to read
 * @throws IOException propagated from the parent {@code open} or from the Avro reader
 */
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();
  final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

  LOG.info("Opening split " + split);

  dataFileReader = DataFileReader.openReader(seekable, recordReader);
  dataFileReader.sync(split.getStart());
}
Example #14
Source File: AvroUtil.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Get the schema of AVRO files stored in a directory.
 *
 * <p>If {@code path} is a directory, the first listed entry whose name does
 * not start with {@code _} or {@code .} is inspected; otherwise {@code path}
 * itself is read.
 *
 * @param path an Avro file, or a directory containing Avro files
 * @param conf Hadoop configuration used to resolve the file system
 * @return the schema of the inspected file, or {@code null} if {@code path}
 *         is a directory with no matching entries
 * @throws IOException if listing or reading fails
 */
public static Schema getAvroSchema(Path path, Configuration conf) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  Path fileToTest;
  if (fs.isDirectory(path)) {
    FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
      }
    });
    if (fileStatuses.length == 0) {
      return null;
    }
    fileToTest = fileStatuses[0].getPath();
  } else {
    fileToTest = path;
  }

  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  // try-with-resources so the input is released even if the file turns out
  // not to be a readable Avro container and openReader throws.
  try (SeekableInput input = new FsInput(fileToTest, conf);
       FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader)) {
    return fileReader.getSchema();
  }
}
Example #15
Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; Configuration conf = context.getConfiguration(); SeekableInput in = new FsInput(split.getPath(), conf); DatumReader<T> datumReader = new GenericDatumReader<T>(); this.reader = DataFileReader.openReader(in, datumReader); reader.sync(split.getStart()); // sync to start this.start = reader.tell(); this.end = split.getStart() + split.getLength(); }
Example #16
Source File: AvroTestTools.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Read all avro records in an HDFS location into a map from file name to {@link RecordIterator}.
 *
 * <p>Keys are the file paths relativized against {@code path}. Each file's
 * reader is closed by its iterator once the iterator has been exhausted.
 *
 * @param fs file system to read from
 * @param path root location to scan recursively
 * @return a sorted map of relative file name to record iterator; empty if
 *         {@code path} does not exist
 * @throws IOException if listing or opening a file fails
 */
@Override
public TreeMap<String, RecordIterator> readAllRecordsInBinaryDirectory(FileSystem fs, Path path)
    throws IOException {
  TreeMap<String, RecordIterator> recordsByFile = new TreeMap<>();
  if (!fs.exists(path)) {
    return recordsByFile;
  }

  PathFilter hiddenFilter = new HiddenFilter();
  for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, path, hiddenFilter)) {
    SeekableInput fileInput = new FsInput(fileStatus.getPath(), fs);
    DataFileReader<GenericRecord> fileReader =
        new DataFileReader<>(fileInput, new GenericDatumReader<>());
    String relativeName = PathUtils.relativizePath(fileStatus.getPath(), path).toString();

    AbstractIterator<GenericRecord> records = new AbstractIterator<GenericRecord>() {
      @Override
      protected GenericRecord computeNext() {
        if (fileReader.hasNext()) {
          return fileReader.next();
        }
        // Exhausted: release the reader before signalling end of data.
        try {
          fileReader.close();
        } catch (IOException ioe) {
          log.error("Failed to close data file reader.", ioe);
        }
        endOfData();
        return null;
      }
    };
    recordsByFile.put(relativeName, new RecordIterator(fileReader.getSchema(), records));
  }
  return recordsByFile;
}
Example #17
Source File: TestAvroExtractor.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Loads all records of the Avro data file at {@code path}.
 *
 * @param path location of the Avro data file
 * @return the file's records, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path) throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();

  // Both resources are closed on every exit path; previously a failure in
  // openReader or during iteration left the file handle open.
  try (SeekableInput input = new FsInput(new Path(path), config);
       FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
Example #18
Source File: AvroUtils.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Get Avro schema from an Avro data file.
 *
 * @param dataFile the Avro data file to inspect
 * @param fs file system whose configuration is used to open the file
 * @return the schema read from the data file
 * @throws IOException if the file cannot be opened or read
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  try (SeekableInput fsInput = new FsInput(dataFile, fs.getConf());
       DataFileReader<GenericRecord> avroReader =
           new DataFileReader<>(fsInput, new GenericDatumReader<GenericRecord>())) {
    final Schema fileSchema = avroReader.getSchema();
    return fileSchema;
  }
}
Example #19
Source File: AvroExternalTable.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Reads the Avro schema from the first data file found under
 * {@code dataLocationInHdfs}.
 *
 * @return the schema of that data file
 * @throws IOException if the file cannot be located, opened or read
 */
private Schema getSchemaFromAvroDataFile() throws IOException {
  final String dataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
  LOG.info("Extracting schema for table " + this.name + " from avro data file " + dataFilePath);

  final SeekableInput fsInput = new HdfsReader(dataFilePath).getFsInput();
  // Only the schema is needed, so no records are ever read from this reader.
  try (DataFileReader<Void> schemaReader =
           new DataFileReader<>(fsInput, new GenericDatumReader<Void>())) {
    return schemaReader.getSchema();
  }
}
Example #20
Source File: TestMerge.java From aliyun-maxcompute-data-collectors with Apache License 2.0 | 5 votes |
/**
 * Scans the Avro file at {@code p} for a record matching {@code record}.
 *
 * @param fs unused here; kept for signature compatibility with callers
 * @param p the Avro file to scan
 * @param record expected values, compared via {@code valueMatches}
 * @return {@code true} as soon as a matching record is found, else {@code false}
 * @throws IOException if the file cannot be opened or read
 */
private boolean checkAvroFileForLine(FileSystem fs, Path p, List<Integer> record)
    throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();

  // The original never closed the reader on any path; try-with-resources
  // releases it on early return, normal completion and failure alike.
  try (SeekableInput in = new FsInput(p, new Configuration());
       FileReader<GenericRecord> reader = DataFileReader.openReader(in, datumReader)) {
    reader.sync(0);

    while (reader.hasNext()) {
      if (valueMatches(reader.next(), record)) {
        return true;
      }
    }
    return false;
  }
}
Example #21
Source File: AvroFileReader.java From streamx with Apache License 2.0 | 5 votes |
/**
 * Reads the Avro schema of the file at {@code path} and converts it with
 * {@code avroData.toConnectSchema}.
 *
 * @param conf Hadoop configuration used to open the file
 * @param path the Avro data file
 * @return the converted schema
 * @throws IOException if the file cannot be opened or read
 */
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  DatumReader<Object> reader = new GenericDatumReader<>();
  org.apache.avro.Schema schema;

  // try-with-resources so the file handle is released even when openReader
  // throws; the original leaked the input in that case.
  try (SeekableInput input = new FsInput(path, conf);
       FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    schema = fileReader.getSchema();
  }
  return avroData.toConnectSchema(schema);
}
Example #22
Source File: AvroUtils.java From Cubert with Apache License 2.0 | 5 votes |
public static Schema getSchema(SeekableInput input) throws IOException { DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(input, datumReader); Schema schema = dataFileReader.getSchema(); if (PadDefaultNullsToSchema) { // a list of "cloned" fields, with optional default value set to null ArrayList<Field> paddedFields = new ArrayList<Field>(); for (Field field: schema.getFields()) { // should this field be padded? boolean needsNullPadding = (field.schema() != null) // the field has nested schema && (field.schema().getType().equals(Type.UNION)) // the nested schema is UNION && (field.schema().getTypes().get(0).getType().equals(Type.NULL)); // the first element of union is NULL type JsonNode defValue = needsNullPadding ? NullNode.getInstance() : field.defaultValue(); Field f = new Field(field.name(), field.schema(), field.doc(), defValue); paddedFields.add(f); } schema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError()); schema.setFields(paddedFields); } return schema; }
Example #23
Source File: AvroFileReader.java From streamx with Apache License 2.0 | 5 votes |
/**
 * Reads every datum of the Avro file at {@code path} into a collection.
 *
 * @param conf Hadoop configuration used to open the file
 * @param path the Avro data file
 * @return all data read from the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  ArrayList<Object> collection = new ArrayList<>();
  DatumReader<Object> reader = new GenericDatumReader<>();

  // Closing via try-with-resources covers failures while opening or
  // iterating, which the original's trailing close() did not.
  try (SeekableInput input = new FsInput(path, conf);
       FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    for (Object object : fileReader) {
      collection.add(object);
    }
  }
  return collection;
}
Example #24
Source File: ClusterHdfsSource.java From datacollector with Apache License 2.0 | 5 votes |
/**
 * Builds a preview batch from an Avro file. Each record is re-encoded as a
 * standalone Avro container in memory and paired with a key of the form
 * {@code <file path>::<record index>}.
 *
 * <p>Reading stops when the file is exhausted, when the batch holds
 * {@code batchSize} entries, or when the running preview count (seeded from
 * {@code previewBuffer.size()}) reaches {@code batchSize}.
 *
 * @param fileStatus status of the Avro file to preview
 * @param batchSize upper bound applied to both the batch and the preview count
 * @return the preview entries
 * @throws IOException if reading or re-encoding fails
 */
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize) throws IOException {
  int previewed = previewBuffer.size();
  final Path avroFile = fileStatus.getPath();
  final SeekableInput seekable = new FsInput(avroFile, hadoopConf);
  final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  final List<Map.Entry> batch = new ArrayList<>();

  try (FileReader<GenericRecord> records = DataFileReader.openReader(seekable, datumReader)) {
    for (int index = 0;
         records.hasNext() && batch.size() < batchSize && previewed < batchSize;
         index++, previewed++) {
      GenericRecord datum = records.next();

      // Serialize this single record into its own in-memory Avro container.
      ByteArrayOutputStream encoded = new ByteArrayOutputStream();
      DataFileWriter<GenericRecord> singleRecordWriter =
          new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(datum.getSchema()));
      try {
        singleRecordWriter.create(datum.getSchema(), encoded);
        singleRecordWriter.append(datum);
      } finally {
        singleRecordWriter.close();
        encoded.close();
      }

      batch.add(new Pair(avroFile.toUri().getPath() + "::" + index, encoded.toByteArray()));
    }
  }
  return batch;
}
Example #25
Source File: BaseCommand.java From parquet-mr with Apache License 2.0 | 4 votes |
public SeekableInput openSeekable(String filename) throws IOException { Path path = qualifiedPath(filename); // even though it was qualified using the default FS, it may not be in it FileSystem fs = path.getFileSystem(getConf()); return new SeekableFSDataInputStream(fs, path); }
Example #26
Source File: AvroConversionBaseMapper.java From datacollector with Apache License 2.0 | 4 votes |
@Override protected void map(String input, String output, Context context) throws IOException, InterruptedException { FileSystem fs = FileSystem.get(context.getConfiguration()); Configuration conf = context.getConfiguration(); LOG.info("Converting input file: {}", input); LOG.info("Output directory: {}", output); Path inputPath = new Path(input); Path outputDir = new Path(output); fs.mkdirs(outputDir); Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName()); if(fs.exists(tempFile)) { if(conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) { fs.delete(tempFile, true); } else { throw new IOException("Temporary file " + tempFile + " already exists."); } } LOG.info("Using temp file: {}", tempFile); // Output file is the same as input except of dropping .avro extension if it exists and appending .parquet or .orc String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix(); Path finalFile = new Path(outputDir, outputFileName); LOG.info("Final path will be: {}", finalFile); // Avro reader SeekableInput seekableInput = new FsInput(inputPath, conf); DatumReader<GenericRecord> reader = new GenericDatumReader<>(); FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader); Schema avroSchema = fileReader.getSchema(); initializeWriter(tempFile, avroSchema, conf, context); LOG.info("Started reading input file"); long recordCount = 0; try { while (fileReader.hasNext()) { GenericRecord record = fileReader.next(); handleAvroRecord(record); context.getCounter(Counters.PROCESSED_RECORDS).increment(1); recordCount++; } } catch (Exception e) { // Various random stuff can happen while converting, so we wrap the underlying exception with more details String message = String.format( "Exception at offset %d (record %d): %s", fileReader.tell(), recordCount, e.toString() ); throw new IOException(message, e); } LOG.info("Done reading input file"); closeWriter(); LOG.info("Moving 
temporary file {} to final destination {}", tempFile, finalFile); fs.rename(tempFile, finalFile); if(!context.getConfiguration().getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) { LOG.info("Removing input file", inputPath); fs.delete(inputPath, true); } LOG.info("Done converting input file into output directory {}", output); }