/* * Modifications Copyright 2019 Graz University of Technology * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.tugraz.sysds.runtime.instructions.spark; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.spark.api.java.JavaPairRDD; import org.tugraz.sysds.common.Types.DataType; import org.tugraz.sysds.common.Types.ValueType; import org.tugraz.sysds.hops.recompile.Recompiler; import org.tugraz.sysds.runtime.DMLRuntimeException; import org.tugraz.sysds.runtime.controlprogram.caching.CacheableData; import org.tugraz.sysds.runtime.controlprogram.caching.FrameObject; import org.tugraz.sysds.runtime.controlprogram.context.ExecutionContext; import org.tugraz.sysds.runtime.controlprogram.context.SparkExecutionContext; import org.tugraz.sysds.runtime.instructions.InstructionUtils; import org.tugraz.sysds.runtime.instructions.cp.CPOperand; import org.tugraz.sysds.runtime.instructions.spark.utils.FrameRDDConverterUtils; import org.tugraz.sysds.runtime.instructions.spark.utils.RDDConverterUtils; import org.tugraz.sysds.runtime.matrix.data.FrameBlock; import org.tugraz.sysds.runtime.matrix.data.InputInfo; import org.tugraz.sysds.runtime.matrix.data.MatrixBlock; import 
org.tugraz.sysds.runtime.matrix.data.MatrixIndexes;
import org.tugraz.sysds.runtime.matrix.operators.Operator;
import org.tugraz.sysds.runtime.meta.DataCharacteristics;
import org.tugraz.sysds.runtime.meta.MetaDataFormat;
import org.tugraz.sysds.utils.Statistics;

/**
 * Spark instruction that reblocks a CSV input (matrix or frame) into the
 * internal binary-block representation, applying the CSV parsing options
 * (header row, field delimiter, fill of missing values).
 */
public class CSVReblockSPInstruction extends UnarySPInstruction {
	private int _blen;           // output block size (rows and columns share one block length)
	private boolean _hasHeader;  // true if the first CSV line is a header to skip
	private String _delim;       // field delimiter
	private boolean _fill;       // fill missing values with _fillValue instead of failing
	private double _fillValue;   // value used when _fill is enabled

	/**
	 * @param op        operator (unused for reblock, may be null)
	 * @param in        input operand (CSV-backed matrix or frame)
	 * @param out       output operand (binary-block matrix or frame)
	 * @param br        row block size
	 * @param bc        column block size (expected equal to br)
	 * @param hasHeader true if the CSV input has a header line
	 * @param delim     CSV field delimiter
	 * @param fill      true to fill missing values
	 * @param fillValue value substituted for missing entries when fill is enabled
	 * @param opcode    instruction opcode
	 * @param instr     original instruction string
	 */
	protected CSVReblockSPInstruction(Operator op, CPOperand in, CPOperand out, int br, int bc,
		boolean hasHeader, String delim, boolean fill, double fillValue, String opcode, String instr)
	{
		super(SPType.CSVReblock, op, in, out, opcode, instr);
		// BUGFIX: previously _blen was assigned twice (br, then bc), making the
		// first assignment dead and silently dropping br when br != bc.
		// Row and column block sizes are expected to be identical (the parser
		// passes the same value for both), so assign once.
		_blen = br;
		_hasHeader = hasHeader;
		_delim = delim;
		_fill = fill;
		_fillValue = fillValue;
	}

	/**
	 * Parses a csvrblk instruction string into a CSVReblockSPInstruction.
	 *
	 * @param str instruction string
	 * @return parsed instruction
	 * @throws DMLRuntimeException if the opcode is not csvrblk
	 */
	public static CSVReblockSPInstruction parseInstruction(String str) {
		String opcode = InstructionUtils.getOpCode(str);
		if( !opcode.equals("csvrblk") )
			throw new DMLRuntimeException("Incorrect opcode for CSVReblockSPInstruction:" + opcode);

		// Example parts of CSVReblockSPInstruction:
		// [csvrblk, pREADmissing_val_maps·MATRIX·DOUBLE, _mVar37·MATRIX·DOUBLE,
		// 1000, 1000, false, ,, true, 0.0]
		// NOTE(review): the example above shows two block sizes (old format), but the
		// parser below reads a single block size at parts[3] — confirm current format.
		String parts[] = InstructionUtils.getInstructionPartsWithValueType(str);
		CPOperand in = new CPOperand(parts[1]);
		CPOperand out = new CPOperand(parts[2]);
		int blen = Integer.parseInt(parts[3]);
		boolean hasHeader = Boolean.parseBoolean(parts[4]);
		String delim = parts[5];
		boolean fill = Boolean.parseBoolean(parts[6]);
		double fillValue = Double.parseDouble(parts[7]);

		return new CSVReblockSPInstruction(null, in, out, blen, blen,
			hasHeader, delim, fill, fillValue, opcode, str);
	}

	@Override
	public void processInstruction(ExecutionContext ec) {
		SparkExecutionContext sec = (SparkExecutionContext) ec;

		// sanity check: the input must be CSV-backed
		CacheableData<?> obj = sec.getCacheableData(input1.getName());
		MetaDataFormat iimd = (MetaDataFormat) obj.getMetaData();
		if (iimd.getInputInfo() != InputInfo.CSVInputInfo) {
			throw new DMLRuntimeException("The given InputInfo is not implemented for "
				+ "CSVReblockSPInstruction:" + iimd.getInputInfo());
		}

		// set output characteristics (same dims as input, new block size)
		DataCharacteristics mcIn = sec.getDataCharacteristics(input1.getName());
		DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());
		mcOut.set(mcIn.getRows(), mcIn.getCols(), _blen);

		// check for in-memory reblock (w/ lazy spark context, potential for latency reduction)
		if( Recompiler.checkCPReblock(sec, input1.getName()) ) {
			if( input1.getDataType() == DataType.MATRIX )
				Recompiler.executeInMemoryMatrixReblock(sec, input1.getName(), output.getName());
			else if( input1.getDataType() == DataType.FRAME )
				Recompiler.executeInMemoryFrameReblock(sec, input1.getName(), output.getName());
			// handled in CP, so this SP instruction did not actually execute
			Statistics.decrementNoOfExecutedSPInst();
			return;
		}

		// execute matrix/frame csvreblock
		JavaPairRDD<?,?> out = null;
		if( input1.getDataType() == DataType.MATRIX )
			out = processMatrixCSVReblockInstruction(sec, mcOut);
		else if( input1.getDataType() == DataType.FRAME )
			out = processFrameCSVReblockInstruction(sec, mcOut, ((FrameObject)obj).getSchema());

		// put output RDD handle into symbol table
		sec.setRDDHandleForVariable(output.getName(), out);
		sec.addLineageRDD(output.getName(), input1.getName());
	}

	/**
	 * Reblocks a CSV-backed matrix input into binary blocks.
	 *
	 * @param sec   spark execution context
	 * @param mcOut output data characteristics (dims and block size)
	 * @return RDD of binary matrix blocks
	 */
	@SuppressWarnings("unchecked")
	protected JavaPairRDD<MatrixIndexes,MatrixBlock> processMatrixCSVReblockInstruction(
		SparkExecutionContext sec, DataCharacteristics mcOut)
	{
		//get input rdd (needs to be longwritable/text for consistency with meta data, in case of
		//serialization issues create longwritableser/textser as serializable wrappers
		JavaPairRDD<LongWritable, Text> in = (JavaPairRDD<LongWritable, Text>)
			sec.getRDDHandleForMatrixObject(sec.getMatrixObject(input1), InputInfo.CSVInputInfo);

		//reblock csv to binary block
		return RDDConverterUtils.csvToBinaryBlock(sec.getSparkContext(),
			in, mcOut, _hasHeader, _delim, _fill, _fillValue);
	}

	/**
	 * Reblocks a CSV-backed frame input into binary frame blocks.
	 *
	 * @param sec    spark execution context
	 * @param mcOut  output data characteristics (dims and block size)
	 * @param schema frame column value types
	 * @return RDD of binary frame blocks keyed by row index
	 */
	@SuppressWarnings("unchecked")
	protected JavaPairRDD<Long,FrameBlock> processFrameCSVReblockInstruction(
		SparkExecutionContext sec, DataCharacteristics mcOut, ValueType[] schema)
	{
		//get input rdd (needs to be longwritable/text for consistency with meta data, in case of
		//serialization issues create longwritableser/textser as serializable wrappers
		JavaPairRDD<LongWritable, Text> in = (JavaPairRDD<LongWritable, Text>)
			sec.getRDDHandleForFrameObject(sec.getFrameObject(input1), InputInfo.CSVInputInfo);

		//reblock csv to binary block
		return FrameRDDConverterUtils.csvToBinaryBlock(sec.getSparkContext(),
			in, mcOut, schema, _hasHeader, _delim, _fill, _fillValue);
	}
}