Java Examples

The following examples show how to use the `Tuple` type. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * java level API
 * @param input expects a numeric value to round, a number of digits to keep, and an optional rounding mode.
 * @return a single numeric value, the number with only those digits retained
 */
public Double exec(Tuple input) throws IOException {
    // At least two fields (value, digits) are required; otherwise the result is null.
    if (input == null || input.size() < 2)
        return null;

    try {
        Double       num    = DataType.toDouble(input.get(0));
        Integer      digits = DataType.toInteger(input.get(1));
        // An optional third field selects the rounding mode by its integer code;
        // without it, HALF_EVEN is used.
        RoundingMode mode   = (input.size() >= 3) ?
            RoundingMode.valueOf(DataType.toInteger(input.get(2))) : RoundingMode.HALF_EVEN;
        if (num == null) return null;

        // NOTE(review): digits is auto-unboxed by setScale with no null check —
        // confirm DataType.toInteger cannot return null for field 1.
        BigDecimal bdnum  = BigDecimal.valueOf(num);
        bdnum = bdnum.setScale(digits, mode);
        return bdnum.doubleValue();
    } catch (NumberFormatException nfe){
        // Number-format failures are reported on stderr and turned into a null result.
        System.err.println("Failed to process input; error - " + nfe.getMessage());
        return null;
    } catch (Exception e){
        // Any other failure is rethrown as an IOException carrying the original cause.
        throw new IOException("Caught exception processing input row ", e);
        // NOTE(review): the closing braces of this method are missing from the snippet as shown.
Example #2
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a tuple of mixed field types and checks that {@code toDelimitedString(",")}
 * produces the expected comma-separated text.
 */
public void testToDelimitedString() {
    Tuple t = mTupleFactory.newTuple();
    t.append(new Integer(1));
    t.append(new Long(2));
    t.append(new Float(1.1f));
    t.append(new Double(2.3));
    t.append("howdy howdy howdy");
    // NOTE(review): the expected string below has an empty field (",,") at this position,
    // but no matching append is visible — presumably a null append was lost from the
    // snippet; confirm against the original source.
    t.append("woah there");
    t.append(new Double(2000000.3000000001));
    t.append(new Float(1000000000.1000001f));
    t.append(new Long(2001010101));
    t.append(new Integer(100010101));
    try {
        String expected = "1,2,1.1,2.3,howdy howdy howdy,,woah there,2000000.3,1.0E9,2001010101,100010101";
        assertEquals(expected, t.toDelimitedString(","));
    } catch (Exception e) {
        // Wrapped so the test method needs no throws clause.
        throw new RuntimeException(e);
        // NOTE(review): closing braces are missing from the snippet as shown.
Example #3
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Transform bytes from a byte array up to the specified length to a <code>Tuple</code>
 * @param buf the byte array
 * @param offset index in the byte array at which reading starts
 * @param length number of bytes to consume from the byte array
 * @param fieldDel the field delimiter
 * @return tuple constructed from the bytes
 */
public static Tuple bytesToTuple(byte[] buf, int offset, int length, byte fieldDel) {

    // start marks the first byte of the field currently being scanned.
    int start = offset;

    ArrayList<Object> protoTuple = new ArrayList<Object>();

    // NOTE(review): the loop bound is `length`, not `offset + length`, so `length` acts as
    // an end index here — confirm this matches what callers pass when offset != 0.
    for (int i = offset; i < length; i++) {
        if (buf[i] == fieldDel) {
            // A delimiter closes the current field: hand [start, i) to readField and
            // begin the next field just after the delimiter.
            readField(protoTuple, buf, start, i);
            start = i + 1;

    // pick up the last field
    if (start <= length) {
        readField(protoTuple, buf, start, length);

    // The list is passed to the tuple factory's no-copy constructor.
    return TupleFactory.getInstance().newTupleNoCopy(protoTuple);
    // NOTE(review): closing braces are missing from the snippet as shown.
Example #4
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Returns an Integer count derived from the bag stored in field 0 of the tuple;
 * a null bag yields 0.
 */
public Integer exec(Tuple tuple) throws IOException {
    DataBag databag = (DataBag)tuple.get(0);
    if(databag == null) {
        return new Integer(0);
    int count = 0;

    Iterator<Tuple> iterator = databag.iterator();
    // NOTE(review): the loop body was lost from this snippet — as shown, the iterator is
    // never advanced and count is never incremented. Presumably the original advances the
    // iterator and increments count; confirm against the original source.
    while(iterator.hasNext()) {;
    return new Integer(count);
Example #5
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a Pig script that ranks relation A by the column range f1..f3, stores the result
 * through mock.Storage, and compares the stored rows with a fixed expected set.
 */
public void testRank07RankBy() throws IOException {
    String query = "A = LOAD 'test01' USING mock.Storage() AS (f1:chararray,f2:int,f3:chararray);"
        + "C = rank A by f1..f3;"
        + "store C into 'result' using mock.Storage();";

    Util.registerMultiLineQuery(pigServer, query);

    // Each expected row is (rank, f1, f2, f3). Some rows are written twice with identical
    // values (rank 5 and rank 8).
    // NOTE(review): these go into a Set, so identical rows collapse — confirm that
    // verifyExpected is meant to ignore multiplicity.
    Set<Tuple> expected = ImmutableSet.of(
            tf.newTuple(ImmutableList.of((long) 1, "A", 1, "N")),
            tf.newTuple(ImmutableList.of((long) 2, "B", 2, "N")),
            tf.newTuple(ImmutableList.of((long) 3, "C", 3, "M")),
            tf.newTuple(ImmutableList.of((long) 4, "D", 4, "P")),
            tf.newTuple(ImmutableList.of((long) 5, "E", 4, "Q")),
            tf.newTuple(ImmutableList.of((long) 5, "E", 4, "Q")),
            tf.newTuple(ImmutableList.of((long) 7, "F", 7, "Q")),
            tf.newTuple(ImmutableList.of((long) 8, "F", 8, "Q")),
            tf.newTuple(ImmutableList.of((long) 8, "F", 8, "Q")),
            tf.newTuple(ImmutableList.of((long) 10, "F", 8, "T")),
            tf.newTuple(ImmutableList.of((long) 11, "G", 10, "V")));

    verifyExpected(data.get("result"), expected);
Example #6
Source File:    From datafu with Apache License 2.0 6 votes vote down vote up
/**
 * Walks the input bag, copying each tuple's fields into a new tuple, and returns a
 * freshly created default bag.
 * NOTE(review): several statements and all braces are missing from this snippet — as
 * shown, nothing is added to outputBag, t1 is unused, and neither i nor count advances.
 * Confirm the full logic against the original source.
 */
public DataBag call(DataBag inputBag) throws IOException
  DataBag outputBag = BagFactory.getInstance().newDefaultBag();
  long i = start, count = 0;
  // i is immediately re-seeded to the index of the last element, offset by start.
  i = inputBag.size() - 1 + start;

  for (Tuple t : inputBag) {
    // Copy of the current tuple's fields.
    Tuple t1 = TupleFactory.getInstance().newTuple(t.getAll());

    // The counter is reset every 1,000,000 iterations; what else happens at that point
    // is not visible here.
    if (count % 1000000 == 0) {
      count = 0;

  return outputBag;
Example #7
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Folds one more input tuple into the running maximum held in {@code intermediateMax}.
 *
 * @param b the next input tuple
 * @throws IOException an ExecException is rethrown as-is; any other failure is wrapped
 *         in an ExecException with error code 2106
 */
public void accumulate(Tuple b) throws IOException {
    try {
        // presumably max(b) returns null when the tuple has no usable value — confirm.
        DateTime curMax = max(b);
        if (curMax == null) {
        // NOTE(review): the body of the null check above is missing from this snippet
        // (likely an early return) — confirm.
        // check curMax
        if (intermediateMax == null || curMax.isAfter(intermediateMax)) {
            // First value seen, or a later DateTime than the current maximum.
            intermediateMax = curMax;

    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        String msg = "Error while computing max in " + this.getClass().getSimpleName();
        throw new ExecException(msg, errCode, PigException.BUG, e);
Example #8
Source File:    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the first {@code colsToLoad} chararray columns (a0, a1, ...) from {@code out}
 * with ParquetLoader on a local PigServer, counts the rows, asserts the count equals
 * ROW_COUNT, and appends the elapsed time to {@code results}.
 *
 * @param out path passed to the LOAD statement
 * @param colsToLoad number of columns to put in the requested schema
 */
private static void load(String out, int colsToLoad) throws ExecException, IOException {
  long t0 = System.currentTimeMillis();
  // Schema string grows as "a0: chararray, a1: chararray, ..." up to colsToLoad columns.
  StringBuilder schemaString = new StringBuilder("a0: chararray");
  for (int i = 1; i < colsToLoad; i++) {
    schemaString.append(", a" + i + ": chararray");
  PigServer pigServer = new PigServer(ExecType.LOCAL);
  pigServer.registerQuery("B = LOAD '"+out+"' USING "+ParquetLoader.class.getName()+"('"+schemaString+"');");
  pigServer.registerQuery("C = FOREACH (GROUP B ALL) GENERATE COUNT(B);");
  Iterator<Tuple> it = pigServer.openIterator("C");
  if (!it.hasNext()) {
    throw new RuntimeException("Job failed: no tuple to read");
  // NOTE(review): the right-hand side below was truncated in extraction — presumably it
  // reads field 0 of the next tuple from `it`; confirm against the original source.
  Long count = (Long);

  assertEquals(ROW_COUNT, count.longValue());
  long t1 = System.currentTimeMillis();
  results.append((t1-t0)+" ms to read "+colsToLoad+" columns\n");
Example #9
Source File:    From Surus with Apache License 2.0 6 votes vote down vote up
/**
 * Compares two bags tuple by tuple and returns false as soon as one of the checked
 * Double fields (0, 2, 3, 4) differs by more than a fixed tolerance; otherwise true.
 */
private Boolean approximateCompareBags(DataBag inputBag1, DataBag inputBag2) throws ExecException {
  	// Hardcode Acceptable Error
  	double errorLimit = 0.0000001;

Iterator<Tuple> iter1 = inputBag1.iterator();
Iterator<Tuple> iter2 = inputBag2.iterator();
// Only iter1 drives the loop; iter2.hasNext() is never consulted in the visible code.
while (iter1.hasNext()) {
	// NOTE(review): both right-hand sides below were lost in extraction — presumably the
	// next element of iter1 and iter2 respectively; confirm against the original source.
	Tuple tuple1 =;
	Tuple tuple2 =;
	// Check error
	if (Math.abs((Double) tuple1.get(0) - (Double) tuple2.get(0)) > errorLimit) return false;
	// TODO: Add unit test for differenced case
	//if (Math.abs((Double) tuple1.get(1) - (Double) tuple2.get(1)) > errorLimit) return false;
	if (Math.abs((Double) tuple1.get(2) - (Double) tuple2.get(2)) > errorLimit) return false;
	if (Math.abs((Double) tuple1.get(3) - (Double) tuple2.get(3)) > errorLimit) return false;
	if (Math.abs((Double) tuple1.get(4) - (Double) tuple2.get(4)) > errorLimit) return false;

  	return true;
Example #10
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * java level API
 * @param input expects a tuple containing two numeric DataAtom values
 * @return a single numeric DataAtom value, which is the
 * first floating-point argument with the sign of the second
 * floating-point argument.
 */
public Double exec(Tuple input) throws IOException {
       // Fewer than two fields, or a null in either of the first two, yields null.
       if (input == null || input.size() < 2)
           return null;
       if (input.get(0) == null || input.get(1) == null) {
           return null;
		// NOTE(review): the `try {` matching the catch below is missing from this snippet.
		// Both fields are cast directly to Double.
		double first =  (Double)input.get(0);
		double second = (Double)input.get(1);
		return Math.copySign(first, second);
       } catch(Exception e){
           // Any failure is rethrown as an IOException carrying the original cause.
           throw new IOException("Caught exception processing input row ", e);
Example #11
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Loads an ORC sample file with OrcStorage, projects its columns with an empty string
 * in place of bytes1, reloads the data from OUTPUT4, and checks the start of the first
 * tuple's string form.
 */
public void testLoadStoreMoreDataType() throws Exception {
    pigServer.registerQuery("A = load '" + basedir + "orc-file-11-format.orc'" + " using OrcStorage();" );
    // NOTE(review): the tail of the next line (`"B", OUTPUT4, "OrcStorage");`) looks like
    // the arguments of a second call whose receiver/method name was lost in extraction —
    // presumably a store of alias B into OUTPUT4; confirm against the original source.
    pigServer.registerQuery("B = foreach A generate boolean1..double1, '' as bytes1, string1..;");"B", OUTPUT4, "OrcStorage");

    // A bug in ORC InputFormat does not allow empty file in input directory
    fs.delete(new Path(OUTPUT4, "_SUCCESS"), true);

    pigServer.registerQuery("A = load '" + OUTPUT4 + "' using OrcStorage();" );
    Iterator<Tuple> iter = pigServer.openIterator("A");
    // NOTE(review): right-hand side lost in extraction — presumably the next element of iter.
    Tuple t =;
    // The expected prefix continues past the end of this snippet.
    assertTrue(t.toString().startsWith("(false,1,1024,65536,9223372036854775807,1.0,-15.0," +
Example #12
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a three-line local input file, loads it with a schema containing a two-field
 * tuple column, projects t.s1 and t.s2, and iterates over the result against
 * {@code expectedResults}.
 */
public void testNullTupleCols() throws Exception {
    String inputFileName = "TestProject-testNullTupleCols-input.txt";
    // Row 1 has a full tuple, row 2 a one-field tuple, row 3 no tuple field at all.
    String input[] = { "1\t(hello,world)", "2\t(good)", "3" };
    Util.createLocalInputFile(inputFileName, input);
    // PigStorage will return null as the value for the tuple field in the
    // second record since it does not comply with the schema and in the
    // third record since the field is absent
    String query = "a = load '" + inputFileName + "' as (i:int, " +
            "t:tuple(s1:chararray, s2:chararray));" +
            "b = foreach a generate t.s1, t.s2;";

    PigServer ps = new PigServer(ExecType.LOCAL);
    Util.registerMultiLineQuery(ps, query);
    Iterator<Tuple> it = ps.openIterator("b");
    // One expected tuple per input row, in input order.
    Tuple[] expectedResults = new Tuple[] {
                    (Tuple)Util.getPigConstant("('hello', 'world')"),
                    (Tuple)Util.getPigConstant("(null, null)"),
                    (Tuple)Util.getPigConstant("(null, null)")
    // NOTE(review): the array initializer's closing `};` and the loop body below are
    // missing from this snippet; presumably each result is compared with expectedResults[i].
    int i = 0;
    while (it.hasNext()) {
Example #13
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Folds one more input tuple into the running BigDecimal result held in
 * {@code intermediateVal}.
 *
 * @param b the next input tuple
 * @throws IOException an ExecException is rethrown as-is; any other failure is wrapped
 *         in an ExecException with error code 2106
 */
public void accumulate(Tuple b) throws IOException {
    try {
        BigDecimal curVal = doTupleWork(b, opProvider);
        if (curVal == null) {
        // NOTE(review): the body of the null check above is missing from this snippet
        // (likely an early return) — confirm.
        if (intermediateVal == null) {
            // First contribution: start from the seed for the configured operation.
            intermediateVal = getSeed(opProvider.getOp());
        // Combine the running value with this tuple's value using the configured operation.
        intermediateVal = doWork(intermediateVal, curVal, opProvider.getOp());
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        throw new ExecException("Error executing function on BigDecimal", errCode, PigException.BUG, e);
Example #14
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Cogroups two small generated files on their first column, flattens both sides of each
 * group, and asserts on the number of tuples produced.
 */
public void testJoin() throws Exception{
    File f1 = Util.createFile(new String[]{"a:1","b:1","a:1"});
    File f2 = Util.createFile(new String[]{"b","b","a"});

    // f1 is loaded through PigStorage with ':' as its argument; f2 uses the default loader.
    pigServer.registerQuery("a = load '"
            + Util.generateURI(f1.toString(), pigContext) + "' using "
            + PigStorage.class.getName() + "(':');");
    pigServer.registerQuery("b = load '"
            + Util.generateURI(f2.toString(), pigContext) + "';");
    pigServer.registerQuery("c = cogroup a by $0, b by $0;");
    pigServer.registerQuery("d = foreach c generate flatten($1),flatten($2);");

    Iterator<Tuple> iter = pigServer.openIterator("d");
    int count = 0;
    // NOTE(review): the surrounding loop and the right-hand side below were lost in
    // extraction — presumably a while(iter.hasNext()) loop that takes the next tuple and
    // increments count; confirm against the original source.
        Tuple t =;
    Assert.assertEquals(count, 4);
Example #15
Source File:    From stratio-cassandra with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the counter column family CC through CassandraStorage, sums the column values
 * per key, and checks the first result row.
 */
public void testCassandraStorageCounterCF() throws IOException, ClassNotFoundException, TException, TimedOutException, NotFoundException, InvalidRequestException, NoSuchFieldException, UnavailableException, IllegalAccessException, InstantiationException, AuthenticationException, AuthorizationException
    pig.registerQuery("rows = LOAD 'cassandra://thriftKs/SomeApp?" + defaultParameters + "' USING CassandraStorage();");

    //Test counter column family support
    pig.registerQuery("CC = load 'cassandra://thriftKs/CC?" + defaultParameters + "' using CassandraStorage();");
    pig.registerQuery("total_hits = foreach CC generate key, SUM(columns.value);");
    Iterator<Tuple> it = pig.openIterator("total_hits");
    // Only the first row is inspected; an empty result makes no assertions at all.
    if (it.hasNext()) {
        // NOTE(review): right-hand side lost in extraction — presumably the next element of it.
        Tuple t =;
        Assert.assertEquals(t.get(0), "chuck");
        Assert.assertEquals(t.get(1), 4l);
Example #16
Source File:    From datafu with Apache License 2.0 6 votes vote down vote up
/**
 * Iterates over a bag of sample bags taken from field 0 of the input, then builds an
 * output bag from the contents of {@code getReservoir()}.
 * NOTE(review): the bodies of both inner loops and all closing braces are missing from
 * this snippet; only the comments describing them remain. Confirm the actual statements
 * against the original source.
 */
public DataBag exec(Tuple input) throws IOException {
  DataBag bagOfSamples = (DataBag) input.get(0);
  for (Tuple innerTuple : bagOfSamples) {
    // Each outer tuple carries a nested bag of samples in its field 0.
    DataBag samples = (DataBag) innerTuple.get(0);        
    for (Tuple sample : samples) {
      // use the same score as previously generated
  DataBag output = BagFactory.getInstance().newDefaultBag();  
  for (ScoredTuple scoredTuple : getReservoir()) {
    // output the original tuple

  return output;
Example #17
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * When an illustrator is attached, wraps {@code out} in an ExampleTuple, marks it as not
 * synthetic, registers it with the illustrator and returns the wrapper; otherwise returns
 * {@code out} cast to Tuple.
 *
 * @param in not referenced in the visible code
 * @param out the output tuple to mark up
 * @param eqClassIndex not referenced in the visible code
 */
public Tuple illustratorMarkup(Object in, Object out, int eqClassIndex) {
    if (illustrator != null) {
        ExampleTuple tOut = new ExampleTuple((Tuple) out);
        // NOTE(review): lineageTracer is not used in the visible lines; its uses may have
        // been lost in extraction.
        LineageTracer lineageTracer = illustrator.getLineage();
        // Equivalence classes are created only if the illustrator has none yet: one set
        // per input.
        if (illustrator.getEquivalenceClasses() == null) {
            LinkedList<IdentityHashSet<Tuple>> equivalenceClasses = new LinkedList<IdentityHashSet<Tuple>>();
            for (int i = 0; i < numInputs; ++i) {
                IdentityHashSet<Tuple> equivalenceClass = new IdentityHashSet<Tuple>();
            // NOTE(review): the statement adding equivalenceClass to the list and the
            // closing braces are missing from this snippet.
            illustrator.setEquivalenceClasses(equivalenceClasses, parent);
        tOut.synthetic = false; // not expect this to be really used
        illustrator.addData((Tuple) tOut);
        return tOut;
    } else
        return (Tuple) out;
Example #18
Source File:    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Folds one more input tuple into the running Long result held in
 * {@code intermediateVal}.
 *
 * @param b the next input tuple
 * @throws IOException an ExecException is rethrown as-is; any other failure is wrapped
 *         in an ExecException with error code 2106
 */
public void accumulate(Tuple b) throws IOException {
    try {
        Long curVal = doTupleWork(b, opProvider);
        if (curVal == null) {
        // NOTE(review): the body of the null check above is missing from this snippet
        // (likely an early return) — confirm.
        if (intermediateVal == null) {
            // First contribution: start from the seed for the configured operation.
            intermediateVal = getSeed(opProvider.getOp());
        // Combine the running value with this tuple's value using the configured operation.
        intermediateVal = doWork(intermediateVal, curVal, opProvider.getOp());
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        throw new ExecException("Error executing function on Longs", errCode, PigException.BUG, e);
Example #19
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code doTupleWork} on the input and wraps its result in a new tuple.
 *
 * @param input the input tuple
 * @return a tuple created by {@code tfact} from the result of {@code doTupleWork}
 * @throws IOException an ExecException is rethrown as-is; any other failure is wrapped
 *         in an ExecException with error code 2106
 */
public Tuple exec(Tuple input) throws IOException {
    try {
        // This object itself is passed as the second argument of doTupleWork.
        return tfact.newTuple(doTupleWork(input, this));
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        throw new ExecException("Error executing function on Floats", errCode, PigException.BUG, e);
Example #20
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a POSort operator over the given bag and checks that consecutive output tuples
 * are in non-descending order on field 1.
 * NOTE(review): the statements that wire pr1/expPlan/sortPlans/mAscCols/read/inputs
 * together are not visible in this snippet — presumably lost in extraction.
 */
public void poSortAscInt( DataBag input) throws ExecException {

        List<PhysicalPlan> sortPlans = new LinkedList<PhysicalPlan>();
        POProject pr1 = new POProject(new OperatorKey("", r.nextLong()), -1, 1);
        PhysicalPlan expPlan = new PhysicalPlan();
        List<Boolean> mAscCols = new LinkedList<Boolean>();
        PORead read = new PORead(new OperatorKey("", r.nextLong()), input);
        List<PhysicalOperator> inputs = new LinkedList<PhysicalOperator>();
        POSort sort = new POSort(new OperatorKey("", r.nextLong()), -1, inputs,
                sortPlans, mAscCols, null);
        Tuple t = null;
        // res1 trails res2 by one tuple so each adjacent pair can be compared.
        Result res1 = sort.getNextTuple();
        // System.out.println(res1.result);
        Result res2 = sort.getNextTuple();
        while (res2.returnStatus != POStatus.STATUS_EOP) {
            Object i1 = ((Tuple) res1.result).get(1);
            Object i2 = ((Tuple) res2.result).get(1);
            // NOTE(review): the call on the next line was truncated in extraction —
            // presumably a comparison of i1 with i2; confirm against the original source.
            int i =, i2);
            // The earlier value must not compare greater than the later one.
            assertEquals(true, (i <= 0));
            // System.out.println(res2.result);
            res1 = res2;
            res2 = sort.getNextTuple();
Example #21
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Merges partial (sum, count) tuples into one two-field tuple: field 0 is the total of
 * the partial sums (or null if every partial sum was null), field 1 the total of the
 * partial counts.
 *
 * @param values bag of tuples whose field 0 is a Long sum and field 1 a Long count
 * @return a two-field tuple (sum or null, count)
 */
static protected Tuple combine(DataBag values) throws ExecException {
    long sum = 0;
    long count = 0;

    // combine is called from Intermediate and Final
    // In either case, Initial would have been called
    // before and would have sent in valid tuples
    // Hence we don't need to check if incoming bag
    // is empty

    Tuple output = mTupleFactory.newTuple(2);
    // Tracks whether any partial sum was non-null, so an all-null input yields a null sum.
    boolean sawNonNull = false;
    for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
        // NOTE(review): right-hand side lost in extraction — presumably the next element of it.
        Tuple t =;
        Long l = (Long)t.get(0);
        // we count nulls in avg as contributing 0
        // a departure from SQL for performance of
        // COUNT() which implemented by just inspecting
        // size of the bag
        if(l == null) {
            l = 0L;
        } else {
            sawNonNull = true;
        sum += l;
        // NOTE(review): field 1 is unboxed without a null check — confirm it is never null.
        count += (Long)t.get(1);
    if(sawNonNull) {
        output.set(0, Long.valueOf(sum));
    } else {
        output.set(0, null);
    output.set(1, Long.valueOf(count));
    return output;
Example #22
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Takes the bag in field 0 of the input and returns the result of {@code combine} on it.
 *
 * @param input tuple whose field 0 is cast to a DataBag
 * @return the tuple produced by {@code combine}
 * @throws IOException an ExecException is rethrown as-is; any other failure is wrapped
 *         in an ExecException with error code 2106
 */
public Tuple exec(Tuple input) throws IOException {
    try {
        DataBag b = (DataBag)input.get(0);
        return combine(b);
    } catch (ExecException ee) {
        throw ee;
    } catch (Exception e) {
        int errCode = 2106;
        // The message names the concrete class so the failing function can be identified.
        String msg = "Error while computing average in " + this.getClass().getSimpleName();
        throw new ExecException(msg, errCode, PigException.BUG, e);           
Example #23
Source File:    From Cubert with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a store backed by a paged byte-array output stream, chooses the serializer and
 * deserializers, and (when comparator keys are given) prepares the offset list and key
 * indices.
 *
 * @param schema schema of the stored tuples
 * @param comparatorKeys column names to index; null disables the offset list
 */
public SerializedTupleStore(BlockSchema schema,String[] comparatorKeys) throws IOException
    this.schema = schema;
    this.comparatorKeys = comparatorKeys;
    // The offset list is only maintained when comparator keys were supplied.
    this.createOffsetList = (comparatorKeys != null);
    this.pbaos = new PagedByteArrayOutputStream(CHUNK_SIZE);

    // Compact (de)serializers are used only if the configuration flag is set AND the
    // schema is flat.
    // NOTE(review): the braces and the `else` separating the two assignment groups below
    // are missing from this snippet; the second group is presumably the default branch.
    if (PhaseContext.getConf().getBoolean(CubertStrings.USE_COMPACT_SERIALIZATION, false) && schema.isFlatSchema())
        serializer = new CompactSerializer<Tuple>(schema);
        writablesDeserializer = new CompactDeserializer<Tuple>(schema);
        deserializer = new CompactDeserializer<Tuple>(schema);
        serializer = new DefaultTupleSerializer();
        deserializer = new DefaultTupleDeserializer();
        writablesDeserializer = deserializer;

    if (createOffsetList)
        startOffsetList = new ArrayList<Integer>();
        // Resolve each comparator key name to its column index in the schema.
        keyIndices = new int[comparatorKeys.length];
        for (int i = 0; i < keyIndices.length; i++)
            keyIndices[i] = schema.getIndex(comparatorKeys[i]);

    reader = new SerializedTupleStoreReader(pbaos.getPagedByteArray(), true);
Example #24
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the string in field 0 of the input into a DateTime using a formatter built from
 * a caller-supplied pattern.
 *
 * @param input tuple whose field 0 is the date-time text
 * @return the parsed DateTime, or null if the input is null, empty, or has a null field 0
 */
public DateTime exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.get(0) == null) {
        return null;
    String dtStr = DataType.toString(input.get(0));
    //DateTimeZone dtz = extractDateTimeZone(dtStr);
    //The timezone in the customized format is not predictable
    // NOTE(review): the next line is truncated in extraction — presumably the pattern is
    // read from another input field via a DataType conversion; confirm against the
    // original source.
    DateTimeFormatter dtf = DateTimeFormat.forPattern(DataType
    //if (dtz == null) {
        return dtf.parseDateTime(dtStr);
    //} else {
    //    return dtf.withZone(dtz).parseDateTime(dtStr);
Example #25
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a JavaScript array-like Scriptable into a Pig DataBag by converting each
 * element with {@code jsToPigTuple}.
 *
 * @param array the script object whose ids are iterated
 * @param schema schema of the bag; a single TUPLE field is unwrapped to its inner schema
 * @param depth nesting depth, passed to the debug helpers and incremented for elements
 * @return a default bag holding the converted tuples
 */
private DataBag jsToPigBag(Scriptable array, Schema schema, int depth) throws FrontendException, ExecException {
    debugConvertJSToPig(depth, "Bag", array, schema);
    // A bag schema consisting of exactly one tuple field describes the element tuple;
    // use that inner schema for the elements.
    if (schema.size() == 1 && schema.getField(0).type == DataType.TUPLE) {
        schema = schema.getField(0).schema;
    List<Tuple> bag = new ArrayList<Tuple>();
    for (Object id : array.getIds()) {
        // Ids are cast to Integer and used as indexes; the looked-up value is cast to Scriptable.
        Scriptable arrayValue = (Scriptable)array.get(((Integer)id).intValue(), null);
        bag.add(jsToPigTuple(arrayValue, schema, depth + 1));
    DataBag result = BagFactory.getInstance().newDefaultBag(bag);
    debugReturn(depth, result);
    return result;
Example #26
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the task's default work file for writing, creates its index file through an
 * IndexManager, and returns an IndexedStorageRecordWriter over the output stream.
 *
 * @param context task attempt context supplying the configuration and work-file location
 * @return a record writer using this object's field delimiter and the new index manager
 */
public RecordWriter<WritableComparable, Tuple> getRecordWriter(
        TaskAttemptContext context) throws IOException,
        InterruptedException {

    Configuration conf = context.getConfiguration();

    FileSystem fs = FileSystem.get(conf);
    Path file = this.getDefaultWorkFile(context, "");
    // The second argument is false — presumably Hadoop's overwrite flag, so creation
    // would fail if the file already exists; confirm.
    FSDataOutputStream fileOut = fs.create(file, false);

    IndexManager indexManager = new IndexManager(offsetsToIndexKeys);
    indexManager.createIndexFile(fs, file);
    return new IndexedStorageRecordWriter(fileOut, this.fieldDelimiter, indexManager);
Example #27
Source File:    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Generates random vectors and queries, writes them in sparse-line form as Pig test
 * input, and checks the NEIGHBOR_CNT alias: one row per query, each count at least 3,
 * then verifies the neighbours with an L1 distance.
 */
public void testL1UDFSparse() throws Exception

  RandomGenerator rg = new JDKRandomGenerator();
  RandomData rd = new RandomDataImpl(rg);
  int n = 1000;
  List<RealVector> vectors = LSHTest.getVectors(rd, 1000, n);
  PigTest test = createPigTestFromString(l1SparseTest);
  writeLinesToFile("input", getSparseLines(vectors));
  List<RealVector> queries = LSHTest.getVectors(rd, 1000, 10);
  writeLinesToFile("queries", getSparseLines(queries));
  List<Tuple> neighbors = this.getLinesForAlias(test, "NEIGHBOR_CNT");
  // One result row is expected per query vector.
  Assert.assertEquals( queries.size(), neighbors.size() );
  for(long cnt : getCounts(neighbors))
    Assert.assertTrue(cnt >= 3);
  // Anonymous Distance that delegates to L1.distance.
  // NOTE(review): the braces of this anonymous class are missing from the snippet.
  Distance d = new Distance()

    public double distance(RealVector v1, RealVector v2) {
      return L1.distance(v1, v2);

  verifyPoints(neighbors, d, 1000);
Example #28
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * While tuples are collected, they are counted one by one by a global counter per task.
 */
public void collect(Context context, Tuple tuple)
throws InterruptedException, IOException {
    // The tuple is written with a null key.
    context.write(null, tuple);
    PigStatusReporter reporter = PigStatusReporter.getInstance();
    if (reporter != null) {
                // NOTE(review): the start of this statement was lost in extraction — only
                // the tail of a call taking a job-id-based name, taskID and an increment
                // of 1 remains; presumably a counter increment on reporter. Confirm
                // against the original source.
                + context.getJobID().toString(), taskID, 1);
Example #29
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * For generating a sample dataset
 */
private List<Tuple> genDataSetFile3() throws IOException {

    // Number of tuples to generate.
    int dataLength = 256;
    List<Tuple> tuples = Lists.newArrayList();

    // Zero-pads every value to seven digits.
    DecimalFormat formatter = new DecimalFormat("0000000");
    for (int i = 0; i < dataLength; i++) {
        // Each tuple pairs the formatted index with the formatted index modulo 20.
        tuples.add(tuple(formatter.format(i), formatter.format(i % 20)));

    return tuples;
Example #30
Source File:    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * For each entry in rawInputMap (the {@code fromMap} argument), feed the list of tuples into the aggregator funcs
 * and add the results to processedInputMap (the {@code toMap} argument). Remove the entries from rawInputMap as we go.
 * @throws ExecException
 */
private int aggregate(Map<Object, List<Tuple>> fromMap, Map<Object, List<Tuple>> toMap, int numEntriesInTarget) throws ExecException {
    Iterator<Map.Entry<Object, List<Tuple>>> iter = fromMap.entrySet().iterator();
    while (iter.hasNext()) {
        // NOTE(review): right-hand side lost in extraction — presumably the next element
        // of iter; confirm against the original source.
        Map.Entry<Object, List<Tuple>> entry =;
        // Bundle the key with its collected tuples, run it through getOutput, and store
        // the aggregated result under the same key in toMap.
        Tuple valueTuple = createValueTuple(entry.getKey(), entry.getValue());
        Result res = getOutput(entry.getKey(), valueTuple);
        addKeyValToMap(toMap, entry.getKey(), getAggResultTuple(res.result));
    // NOTE(review): no removal from fromMap and no update of numEntriesInTarget is visible
    // in this snippet; those statements may have been lost along with the closing braces.
    return numEntriesInTarget;