/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package datafu.pig.bags; import java.io.IOException; import org.apache.pig.AccumulatorEvalFunc; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; /** * <p> * This UDF will extract a tuple from a bag based on a specified index. 
* </p>
 * <p>
 * There are three input parameters:
 * </p>
 * <ol>
 *   <li>DataBag - the bag to extract the tuple from</li>
 *   <li>Index - the zero-based position of the tuple to extract</li>
 *   <li>Default tuple (Optional) - returned when the bag has no tuple at the index</li>
 * </ol>
 * <p>
 * When the index is not present in the bag and no default tuple is supplied,
 * the result is null.
 * </p>
 * <p>
 * Example:
 * </p>
 * <pre>
 * {@code
 * define TupleFromBag datafu.pig.bags.TupleFromBag();
 * %declare defaultTuple TOTUPLE(0,'NO_NUMBER')
 *
 * data = LOAD 'input' using PigStorage(',') AS (a:INT,b:CHARARRAY);
 * -- input:
 * (1,a)
 * (1,b)
 * (1,c)
 * (2,d)
 * (2,e)
 * (2,f)
 * (3,g)
 * (3,h)
 * (3,i)
 *
 * grouped = GROUP data BY a;
 *
 * --output:
 * {group: int,data: {(a: int,b: chararray)}}
 * (1,{(1,c),(1,b),(1,a)})
 * (2,{(2,f),(2,e),(2,d)})
 * (3,{(3,i),(3,h),(3,g)})
 *
 * result1 = FOREACH grouped GENERATE
 *              group AS a,
 *              TupleFromBag(data, 0);
 *
 * --output:
 * {a: int,(a: int,b: chararray)}
 * (1,(1,c))
 * (2,(2,f))
 * (3,(3,i))
 *
 *
 * result2 = FOREACH grouped GENERATE
 *              group AS a,
 *              TupleFromBag(data,0).b as first_b,
 *              TupleFromBag(data,1).b as second_b;
 *
 * --output:
 * {a: int,first_b: chararray,second_b: chararray}
 * (1,c,b)
 * (2,f,e)
 * (3,i,h)
 *
 *
 * result3 = FOREACH grouped GENERATE
 *              group AS a,
 *              TupleFromBag(data,0).b as first_b,
 *              TupleFromBag(data,3).b as fourth_b;
 *
 * --output:
 * {a: int,first_b: chararray,fourth_b: chararray}
 * (1,c,)
 * (2,f,)
 * (3,i,)
 *
 * result4 = FOREACH grouped GENERATE
 *              group AS a,
 *              TupleFromBag(data,0,$defaultTuple).b as first_b,
 *              TupleFromBag(data,3,$defaultTuple).b as fourth_b;
 *
 * --output:
 * {a: int,first_b: chararray,fourth_b: chararray}
 * (1,c,NO_NUMBER)
 * (2,f,NO_NUMBER)
 * (3,i,NO_NUMBER)
 * }
 * </pre>
 */
public class TupleFromBag extends AccumulatorEvalFunc<Tuple>
{
  /**
   * Number of tuples already skipped. It is only reset in {@link #cleanup()},
   * so the position carries over when the bag arrives through several
   * {@link #accumulate(Tuple)} calls.
   */
  private int tupleIndex = 0;

  /** The tuple found at the requested index, or null while it has not been reached. */
  private Tuple result = null;

  /** The optional fallback tuple taken from the third input field, or null if none was seen. */
  private Tuple defaultResult = null;

  /**
   * Scans the bag in field 0 for the tuple at the zero-based index given in field 1.
   * Once a tuple has been found, further calls do nothing until {@link #cleanup()}.
   * If the index is not reached in this call and the input has exactly three fields,
   * field 2 is converted to a tuple and remembered as the default result.
   *
   * @param tinput input tuple: (bag, index) or (bag, index, default tuple)
   * @throws IOException if the default value in field 2 cannot be read or converted to a tuple
   */
  @Override
  public void accumulate(Tuple tinput) throws IOException
  {
    if (result != null) {
      // Already found the requested tuple; nothing more to do.
      return;
    }

    try {
      DataBag samples = (DataBag) tinput.get(0);
      int index = ((Number) tinput.get(1)).intValue();
      for (Tuple tuple : samples) {
        if (tupleIndex == index) {
          result = tuple;
          return;
        }
        tupleIndex++;
      }
    } catch (Exception ignored) {
      // Deliberately best-effort: a failure while reading the bag or the index
      // (for example a null or wrongly typed field) is not reported. Execution
      // continues so that the default tuple can still be captured below and a
      // later call can keep looking for the requested index.
    }

    if (defaultResult == null && tinput.size() == 3) {
      defaultResult = DataType.toTuple(tinput.get(2));
    }
  }

  /**
   * Validates the input schema and derives the output schema from the bag's own schema.
   *
   * @param input schema of the UDF arguments
   * @return a copy of the schema attached to the first input field (the bag)
   * @throws RuntimeException if the input does not have two or three fields, if the
   *         second field is not an INT, or if the input schema cannot be inspected
   */
  @Override
  public Schema outputSchema(Schema input)
  {
    try {
      int fieldCount = input.size();
      if (fieldCount != 2 && fieldCount != 3) {
        throw new RuntimeException("Expected input to have two or three fields");
      }

      byte indexType = input.getField(1).type;
      if (indexType != DataType.INTEGER) {
        throw new RuntimeException("Expected an INT as second input, got: " + indexType);
      }

      return new Schema(input.getField(0).schema);
    } catch (FrontendException e) {
      // The cause is preserved in the rethrown exception, so it is not printed here as well.
      throw new RuntimeException(e);
    }
  }

  /**
   * Resets all accumulated state so the instance can process the next bag.
   */
  @Override
  public void cleanup()
  {
    tupleIndex = 0;
    result = null;
    defaultResult = null;
  }

  /**
   * Returns the outcome of the accumulation.
   *
   * @return the tuple found at the requested index; otherwise the default tuple,
   *         which is null when no default was supplied
   */
  @Override
  public Tuple getValue()
  {
    if (result != null) {
      return result;
    }
    return defaultResult;
  }
}