/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.morphline.hadoop.rcfile;

import java.io.DataInput;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.Configs;
import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Validator;
import org.kitesdk.morphline.stdio.AbstractParser;

import com.google.common.io.ByteStreams;
import com.typesafe.config.Config;

/**
 * Command for reading Record Columnar (RC) Files. An RCFile can be read in two
 * ways:
 *
 * 1. Row wise - emits one record for every row in the RCFile, with columns
 * mapped to output fields as specified in the column map.
 *
 * 2. Column wise - for every row split (block) in the RCFile, emits one record
 * per column, containing all of that column's row values as a List.
 *
 * For an RCFile with n rows and m columns, row-wise reading emits n records
 * and column-wise reading emits m * (number of row splits) records.
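 *
 * For example, a morphline configuration snippet along the following lines reads
 * each row and maps the first two RCFile columns to output fields (the output
 * field names and writableClass choices are illustrative only):
 *
 * <pre>
 * readRCFile {
 *   readMode : row
 *   includeMetaData : false
 *   columns : [
 *     { inputField : 0, outputField : "name", writableClass : "org.apache.hadoop.io.Text" },
 *     { inputField : 1, outputField : "age", writableClass : "org.apache.hadoop.io.IntWritable" }
 *   ]
 * }
 * </pre>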
 *
 * Decompression is handled automatically.
 */
public final class ReadRCFileBuilder implements CommandBuilder {
  public static final String OUTPUT_MEDIA_TYPE = "application/java-rc-file-record";
  public static final String RC_FILE_META_DATA = "RCFileMetaData";

  @Override
  public Collection<String> getNames() {
    return Collections.singletonList("readRCFile");
  }

  @Override
  public Command build(Config config, Command parent, Command child, MorphlineContext context) {
    return new ReadRCFile(this, config, parent, child, context);
  }

  // /////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  // /////////////////////////////////////////////////////////////////////////////
  private static final class ReadRCFile extends AbstractParser {
    private final List<RCFileColumn> columns = new ArrayList<RCFileColumn>();
    private final RCFileReadMode readMode;
    private final boolean includeMetaData;
    private final Configuration conf = new Configuration();

    private static final String STREAM_PROTOCOL = "stream://";

    public ReadRCFile(CommandBuilder builder, Config config, Command parent,
        Command child, MorphlineContext context) {
      super(builder, config, parent, child, context);
      this.includeMetaData = getConfigs().getBoolean(config, "includeMetaData", false);
      this.readMode = new Validator<RCFileReadMode>()
          .validateEnum(
              config,
              getConfigs().getString(config, "readMode", RCFileReadMode.row.name()),
              RCFileReadMode.class);
      for (Config columnConfig : getConfigs().getConfigList(config, "columns")) {
        columns.add(new RCFileColumn(columnConfig, conf));
      }
      validateArguments();
    }

    @Override
    protected boolean doProcess(Record record, InputStream in) throws IOException {
      Path attachmentPath = getAttachmentPath(record);
      SingleStreamFileSystem fs = new SingleStreamFileSystem(in, attachmentPath);
      RCFile.Reader reader = null;
      try {
        reader = new RCFile.Reader(fs, attachmentPath, conf);
        Record template = record.copy();
        removeAttachments(template);
        template.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE);
        if (includeMetaData) {
          SequenceFile.Metadata metadata = reader.getMetadata();
          if (metadata != null) {
            template.put(RC_FILE_META_DATA, metadata);
          }
        }
        switch (readMode) {
        case row:
          return readRowWise(reader, template);
        case column:
          return readColumnWise(reader, template);
        default:
          throw new IllegalStateException();
        }
      } catch (IOException e) {
        throw new MorphlineRuntimeException("IOException while processing attachment "
            + attachmentPath.getName(), e);
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }

    private Path getAttachmentPath(Record record) {
      // We have more meaningful RCFile error messages if we have an attachment name
      String attachmentName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
      if (attachmentName == null) {
        attachmentName = "UNKNOWN";
      }
      return new Path(STREAM_PROTOCOL + attachmentName);
    }

    private boolean readRowWise(final RCFile.Reader reader, final Record record)
        throws IOException {
      LongWritable rowID = new LongWritable();
      while (true) {
        boolean next;
        try {
          next = reader.next(rowID);
        } catch (EOFException ex) {
          // We have hit EOF of the stream
          break;
        }
        if (!next) {
          break;
        }
        incrementNumRecords();
        Record outputRecord = record.copy();
        BytesRefArrayWritable rowBatchBytes = new BytesRefArrayWritable();
        rowBatchBytes.resetValid(columns.size());
        reader.getCurrentRow(rowBatchBytes);

        // Read all the configured columns and set them on the output record
        for (RCFileColumn rcColumn : columns) {
          BytesRefWritable columnBytes = rowBatchBytes.get(rcColumn.getInputField());
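          // Convert the raw column bytes into the Writable type configured for this
          // column; zero-length values are emitted as NullWritable (see updateColumnValue())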
          outputRecord.put(rcColumn.getOutputField(), updateColumnValue(rcColumn, columnBytes));
        }

        // pass record to next command in chain:
        if (!getChild().process(outputRecord)) {
          return false;
        }
      }
      return true;
    }

    private boolean readColumnWise(RCFile.Reader reader, Record record) throws IOException {
      while (true) {
        boolean next;
        try {
          next = reader.nextBlock();
        } catch (EOFException ex) {
          // We have hit EOF of the stream
          break;
        }
        if (!next) {
          break;
        }
        for (RCFileColumn rcColumn : columns) {
          BytesRefArrayWritable rowBatchBytes = reader.getColumn(rcColumn.getInputField(), null);
          List<Writable> rowValues = new ArrayList<Writable>(rowBatchBytes.size());
          incrementNumRecords();
          Record outputRecord = record.copy();
          for (int rowIndex = 0; rowIndex < rowBatchBytes.size(); rowIndex++) {
            BytesRefWritable rowBytes = rowBatchBytes.get(rowIndex);
            rowValues.add(updateColumnValue(rcColumn, rowBytes));
          }
          outputRecord.put(rcColumn.getOutputField(), rowValues);

          // pass record to next command in chain:
          if (!getChild().process(outputRecord)) {
            return false;
          }
        }
      }
      return true;
    }

    private Writable updateColumnValue(RCFileColumn column, BytesRefWritable bytesRef)
        throws IOException {
      if (bytesRef.getLength() == 0) {
        // This is a null field.
        return NullWritable.get();
      }
      Writable newColumnValue = column.newWritable();
      // Small optimization to bypass the DataInput read if the column writable is
      // BytesRefWritable
      if (newColumnValue.getClass() == BytesRefWritable.class) {
        newColumnValue = bytesRef;
      } else {
        byte[] currentRowBytes = Arrays.copyOfRange(bytesRef.getData(),
            bytesRef.getStart(), bytesRef.getStart() + bytesRef.getLength());
        DataInput dataInput = ByteStreams.newDataInput(currentRowBytes);
        newColumnValue.readFields(dataInput);
      }
      return newColumnValue;
    }

    // /////////////////////////////////////////////////////////////////////////////
    // Nested classes:
    // /////////////////////////////////////////////////////////////////////////////
    private static enum RCFileReadMode {
      column, row
    }

    // /////////////////////////////////////////////////////////////////////////////
    // Nested classes:
    // /////////////////////////////////////////////////////////////////////////////
    private static final class RCFileColumn {
      private final int inputField;
      private final String outputField;
      private final Class<? extends Writable> writableClass;
      private final Configuration conf;

      public RCFileColumn(Config columnConfig, Configuration conf) {
        this.conf = conf;
        Configs configs = new Configs();
        this.inputField = configs.getInt(columnConfig, "inputField");
        if (inputField < 0) {
          throw new MorphlineCompilationException(
              "Invalid column inputField specified: " + inputField, columnConfig);
        }

        this.outputField = configs.getString(columnConfig, "outputField");

        String writableClassString = configs.getString(columnConfig, "writableClass");
        if (writableClassString == null || writableClassString.isEmpty()) {
          throw new MorphlineCompilationException(
              "No writableClass specified for column " + outputField, columnConfig);
        }
        try {
          Class<?> clazz = Class.forName(writableClassString);
          if (!Writable.class.isAssignableFrom(clazz)) {
            throw new MorphlineCompilationException("writableClass provided "
                + writableClassString + " for column " + outputField
                + " does not implement " + Writable.class.getName(), columnConfig);
          }
          this.writableClass = clazz.asSubclass(Writable.class);
        } catch (ClassNotFoundException e) {
          throw new MorphlineCompilationException("Could not load class "
              + writableClassString + " definition", columnConfig, e);
        }
        configs.validateArguments(columnConfig);
      }

      public int getInputField() {
        return inputField;
      }
      public String getOutputField() {
        return outputField;
      }

      public Writable newWritable() {
        return ReflectionUtils.newInstance(writableClass, conf);
      }
    }
  }
}