Java Code Examples for org.apache.pig.data.DataBag#size()

The following examples show how to use org.apache.pig.data.DataBag#size(). They are drawn from open-source projects; you can go to the original project or source file by following the links above each example.
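Before the project examples, here is a minimal, self-contained sketch of the basic pattern they all rely on: build a bag with BagFactory and TupleFactory, then call size() to detect an empty bag. The class and helper names below are illustrative (not from any of the projects listed), and the sketch assumes the Pig libraries are on the classpath.

import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class DataBagSizeSketch {

    // The guard used throughout the examples below: a missing or empty bag means "no data".
    static boolean isNullOrEmpty(DataBag bag) {
        return bag == null || bag.size() == 0;
    }

    public static void main(String[] args) throws Exception {
        // Build a small bag of single-field tuples.
        DataBag bag = BagFactory.getInstance().newDefaultBag();
        for (int i = 0; i < 3; i++) {
            Tuple t = TupleFactory.getInstance().newTuple(1);
            t.set(0, i); // Tuple.set declares a checked exception, hence "throws Exception" above
            bag.add(t);
        }

        // size() returns the number of tuples in the bag as a long.
        System.out.println("bag contains " + bag.size() + " tuples"); // prints 3
        System.out.println("empty? " + isNullOrEmpty(bag));           // prints false
    }
}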
Example 1
Source File: EmptyBagToNull.java    From datafu with Apache License 2.0
@Override
public DataBag exec(Tuple tuple) throws IOException
{
  if (tuple.size() == 0 || tuple.get(0) == null)
    return null;
  Object o = tuple.get(0);
  if (o instanceof DataBag)
  {
    DataBag bag = (DataBag)o;
    if (bag.size() == 0)
    {
      return null;
    }
    else
    {
      return bag;
    }
  }
  else
    throw new IllegalArgumentException("expected a null or a bag");
}
 
Example 2
Source File: ReverseEnumerate.java    From datafu with Apache License 2.0
public DataBag call(DataBag inputBag) throws IOException
{
  DataBag outputBag = BagFactory.getInstance().newDefaultBag();
  long count = 0;
  long i = inputBag.size() - 1 + start;

  for (Tuple t : inputBag) {
    Tuple t1 = TupleFactory.getInstance().newTuple(t.getAll());
    t1.append(i);
    outputBag.add(t1);

    if (count % 1000000 == 0) {
      outputBag.spill();
      count = 0;
    }
    i--;
    count++;
  }

  return outputBag;
}
 
Example 3
Source File: PhysicalOperator.java    From spork with Apache License 2.0
public Result getNextDataBag() throws ExecException {
    Result val = new Result();
    DataBag tmpBag = BagFactory.getInstance().newDefaultBag();
    for (Result ret = getNextTuple(); ret.returnStatus != POStatus.STATUS_EOP; ret = getNextTuple()) {
        if (ret.returnStatus == POStatus.STATUS_ERR) {
            return ret;
        } else if (ret.returnStatus == POStatus.STATUS_NULL) {
            continue;
        } else {
            tmpBag.add((Tuple) ret.result);
        }
    }
    val.result = tmpBag;
    val.returnStatus = (tmpBag.size() == 0)? POStatus.STATUS_EOP : POStatus.STATUS_OK;
    return val;
}
 
Example 4
Source File: TestHelper.java    From spork with Apache License 2.0
public static boolean compareBags(DataBag db1, DataBag db2) {
    if (db1.size() != db2.size())
        return false;
    
    boolean equal = true;
    for (Tuple tuple : db2) {
        boolean contains = false;
        for (Tuple tuple2 : db1) {
            if (tuple.compareTo(tuple2) == 0) {
                contains = true;
                break;
            }
        }
        if (!contains) {
            equal = false;
            break;
        }
    }
    return equal;
}
 
Example 5
Source File: AlgebraicFloatMathBase.java    From spork with Apache License 2.0
protected static Float doTupleWork(Tuple input, KnownOpProvider opProvider) throws ExecException {
    DataBag values = (DataBag)input.get(0);
    // if we were handed an empty bag, return NULL
    // this is in compliance with SQL standard
    if(values.size() == 0) {
        return null;
    }
    Float sofar = AlgebraicFloatMathBase.getSeed(opProvider.getOp());
    boolean sawNonNull = false;
    for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
        Tuple t = it.next();
        try {
            Float d = (Float)(t.get(0));
            if (d == null) continue;
            sawNonNull = true;
            sofar = doWork(sofar, d, opProvider.getOp());
        }catch(RuntimeException exp) {
            int errCode = 2103;
            throw new ExecException("Problem doing work on Floats", errCode, PigException.BUG, exp);
        }
    }
    return sawNonNull ? sofar : null;
}
 
Example 6
Source File: AlgebraicIntMathBase.java    From spork with Apache License 2.0
protected static Integer doTupleWork(Tuple input, KnownOpProvider opProvider) throws ExecException {
    DataBag values = (DataBag)input.get(0);
    // if we were handed an empty bag, return NULL
    // this is in compliance with SQL standard
    if(values.size() == 0) {
        return null;
    }
    int sofar = AlgebraicIntMathBase.getSeed(opProvider.getOp());
    boolean sawNonNull = false;
    for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
        Tuple t = it.next();
        try {
            Integer d = (Integer)(t.get(0));
            if (d == null) continue;
            sawNonNull = true;
            sofar = doWork(sofar, d, opProvider.getOp());
        }catch(RuntimeException exp) {
            int errCode = 2103;
            throw new ExecException("Problem doing work on Doubles", errCode, PigException.BUG, exp);
        }
    }
    return sawNonNull ? sofar : null;
}
 
Example 7
Source File: AlgebraicBigIntegerMathBase.java    From spork with Apache License 2.0
protected static BigInteger doTupleWork(Tuple input, KnownOpProvider opProvider) throws ExecException {
    DataBag values = (DataBag)input.get(0);
    // if we were handed an empty bag, return NULL
    // this is in compliance with SQL standard
    if(values.size() == 0) {
        return null;
    }
    BigInteger sofar = AlgebraicBigIntegerMathBase.getSeed(opProvider.getOp());
    boolean sawNonNull = false;
    for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
        Tuple t = it.next();
        try {
            Number n = (Number)(t.get(0));
            if (n == null) continue;
            BigInteger d = (BigInteger) n;
            sawNonNull = true;
            sofar = doWork(sofar, d, opProvider.getOp());
        } catch(RuntimeException exp) {
            int errCode = 2103;
            throw new ExecException("Problem doing work on BigInteger", errCode, PigException.BUG, exp);
        }
    }
    return sawNonNull ? sofar : null;
}
 
Example 8
Source File: DisplayExamples.java    From spork with Apache License 2.0
static String[][] MakeArray(Operator op, DataBag bag)
        throws Exception {
    int rows = (int) bag.size();
    int cols = ((LogicalRelationalOperator)op).getSchema().getFields().size();
    String[][] table = new String[rows][cols];
    Iterator<Tuple> it = bag.iterator();
    for (int i = 0; i < rows; ++i) {
        Tuple t = it.next();
        for (int j = 0; j < cols; ++j) {
            table[i][j] = ShortenField(t.get(j));
        }
    }
    return table;
}
 
Example 9
Source File: AlgebraicByteArrayMathBase.java    From spork with Apache License 2.0
protected static Double doTupleWork(Tuple input, KnownOpProvider opProvider, byte expectedType)
        throws ExecException {
    DataBag values = (DataBag)input.get(0);
    // if we were handed an empty bag, return NULL
    // this is in compliance with SQL standard
    if(values.size() == 0) {
        return null;
    }
    double sofar = AlgebraicByteArrayMathBase.getSeed(opProvider.getOp());
    boolean sawNonNull = false;
    for (Iterator<Tuple> it = values.iterator(); it.hasNext();) {
        Tuple t = it.next();
        try {
            Double d;
            switch (expectedType) {
            case DataType.BYTEARRAY:
                DataByteArray dba = (DataByteArray)t.get(0);
                d = dba != null ? Double.valueOf(dba.toString()): null;
                break;
            case DataType.DOUBLE:
                d = (Double) t.get(0);
                break;
            default:
                throw new ExecException("Unexpected type in AlgebraicByteArrayMath "
                        + DataType.findTypeName(expectedType));
            }
            if (d == null) continue;
            sawNonNull = true;
            sofar = doWork(sofar, d, opProvider.getOp());
        }catch(RuntimeException exp) {
            int errCode = 2103;
            throw new ExecException("Problem doing work on Doubles", errCode, PigException.BUG, exp);
        }
    }
    return sawNonNull ? sofar : null;
}
 
Example 10
Source File: TestStore.java    From spork with Apache License 2.0
/**
 * @param inpD
 * @throws IOException
 */
private void setUpInputFileOnCluster(DataBag inpD) throws IOException {
    String[] data = new String[(int) inpD.size()];
    int i = 0;
    for (Tuple tuple : inpD) {
        data[i] = toDelimitedString(tuple, "\t");
        i++;
    }
    Util.createInputFile(cluster, inputFileName, data);
}
 
Example 11
Source File: FlattenBagOperator.java    From Cubert with Apache License 2.0
private void initCurrentTuple(Tuple inTuple) throws ExecException
{
    // TODO Auto-generated method stub

    this.inTuple = inTuple;
    this.odometerIterators.clear();
    for (int columnId : columnIndexArray)
    {
        FlattenType ftype = flattenPositions.get(columnId);
        if (ftype == null || !isFlattenBag(ftype))
        {

            continue;
        }
        DataBag dbag = (DataBag) (inTuple.get(columnId));// Rui. change outTuple to
                                                         // inTuple
        Iterator<Tuple> tupleIt;

        // Deal with null and empty bags as if they contained a single null tuple.
        if (dbag == null || dbag.size() == 0)
            tupleIt = nullBag.iterator();
        else
            tupleIt = dbag.iterator();
        odometerIterators.add(tupleIt);
    }

    seedOutTuple();// Rui. move it here.

}
 
Example 12
Source File: PageRank.java    From datafu with Apache License 2.0
@Override
public void accumulate(Tuple t) throws IOException
{
  if (aborted)
  {
    return;
  }
  
  DataBag bag = (DataBag) t.get(0);
  if (bag == null || bag.size() == 0)
    return;
  
  for (Tuple sourceTuple : bag) 
  {
    Integer sourceId = (Integer)sourceTuple.get(0);
    DataBag edges = (DataBag)sourceTuple.get(1);
    Double nodeBias = null;
    if (enableNodeBiasing)
    {
      nodeBias = (Double)sourceTuple.get(2);
    }

    ArrayList<Map<String,Object>> edgesMapList = new ArrayList<Map<String, Object>>();

    for (Tuple edgeTuple : edges)
    {
      Integer destId = (Integer)edgeTuple.get(0);
      Double weight = (Double)edgeTuple.get(1);
      HashMap<String,Object> edgeMap = new HashMap<String, Object>();
      edgeMap.put("dest",destId);
      edgeMap.put("weight",weight);
      edgesMapList.add(edgeMap);
    }

    if (enableNodeBiasing)
    {
      graph.addNode(sourceId, edgesMapList, nodeBias.floatValue());
    }
    else
    {
      graph.addNode(sourceId, edgesMapList);
    }

    if (graph.nodeCount() + graph.edgeCount() > maxNodesAndEdges)
    {
      System.out.println(String.format("There are too many nodes and edges (%d + %d > %d). Aborting.", graph.nodeCount(), graph.edgeCount(), maxNodesAndEdges));
      aborted = true;
      break;
    }

    reporter.progress();
  }
}
 
Example 13
Source File: AugmentBaseDataVisitor.java    From spork with Apache License 2.0
@Override
public void visit(LOLimit lm) throws FrontendException {
    if (!limit) // not augment for LIMIT in this traversal
        return;

    if (oriLimitMap == null)
        oriLimitMap = new HashMap<LOLimit, Long>();

    DataBag outputConstraints = outputConstraintsMap.get(lm);
    outputConstraintsMap.remove(lm);

    DataBag inputConstraints = outputConstraintsMap.get(lm.getInput((LogicalPlan) plan));
    if (inputConstraints == null) {
        inputConstraints = BagFactory.getInstance().newDefaultBag();
        outputConstraintsMap.put(lm.getInput((LogicalPlan) plan), inputConstraints);
    }

    DataBag inputData = derivedData.get(lm.getInput((LogicalPlan) plan));

    if (outputConstraints != null && outputConstraints.size() > 0) { // there
        // 's
        // one
        // or
        // more
        // output
        // constraints
        // ;
        // generate
        // corresponding
        // input
        // constraints
        for (Iterator<Tuple> it = outputConstraints.iterator(); it
              .hasNext();) {
            inputConstraints.add(it.next());
         // ... plus one more if only one
         if (inputConstraints.size() == 1) {
            inputConstraints.add(inputData.iterator().next());
            ((PreOrderDepthFirstWalker) currentWalker).setBranchFlag();
         }
      }
    } else if (inputConstraints.size() == 0){
        // add all input to input constraints ...
        inputConstraints.addAll(inputData);
        // ... plus one more if only one
        if (inputConstraints.size() == 1) {
            inputConstraints.add(inputData.iterator().next());
            ((PreOrderDepthFirstWalker) currentWalker).setBranchFlag();
        }
    }
    POLimit poLimit = (POLimit) logToPhysMap.get(lm);
    oriLimitMap.put(lm, Long.valueOf(poLimit.getLimit()));
    poLimit.setLimit(inputConstraints.size()-1);
    lm.setLimit(poLimit.getLimit());
}
 
Example 14
Source File: SetDifference.java    From datafu with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public DataBag exec(Tuple input) throws IOException
{
  if (input.size() < 2)
  {
    throw new RuntimeException("Expected at least two inputs, but found " + input.size());
  }
  
  for (Object o : input)
  {
    if (o != null && !(o instanceof DataBag))
    {
      throw new RuntimeException("Inputs must be bags");
    }
  }

  DataBag outputBag = bagFactory.newDefaultBag();
  
  DataBag bag1 = (DataBag)input.get(0);
  DataBag bag2 = (DataBag)input.get(1);
  
  if (bag1 == null || bag1.size() == 0)
  {
    return outputBag;
  }
  // optimization
  else if (input.size() == 2 && (bag2 == null || bag2.size() == 0))
  {
    return bag1;
  }
  
  PriorityQueue<Pair> pq = loadBags(input);
  
  Tuple lastData = null;

  while (true) 
  {
    Pair nextPair = pq.peek();
    
    // ignore data we've already encountered
    if (nextPair.data.compareTo(lastData) != 0)
    {
      // Only take data from the first bag, where there are no other
      // bags that have the same data.
      if (nextPair.index.equals(0) && countMatches(pq) == 0)
      {
        outputBag.add(nextPair.data);
        lastData = nextPair.data;
      }
    }

    Pair p = pq.poll();      
    
    // only put the bag back into the queue if it still has data
    if (p.hasNext())
    {
      p.next();
      pq.offer(p);
    }
    else if (p.index.equals(0))
    {
      // stop when we exhaust all elements from the first bag
      break;
    }
  }

  return outputBag;
}
 
Example 15
Source File: Over.java    From spork with Apache License 2.0
@Override
public Object exec(Tuple input) throws IOException {
    DataBag inbag = (DataBag)input.get(0);
    if (inbag.size() == 0) return null;
    return inbag.iterator().next().get(0);
}
 
Example 16
Source File: AugmentBaseDataVisitor.java    From spork with Apache License 2.0
@Override
public void visit(LOForEach forEach) throws FrontendException {
    if (limit && !((PreOrderDepthFirstWalker) currentWalker).getBranchFlag())
        return;
    DataBag outputConstraints = outputConstraintsMap.get(forEach);
    outputConstraintsMap.remove(forEach);
    LogicalPlan plan = forEach.getInnerPlan();
    boolean ableToHandle = true;
    List<Integer> cols = new ArrayList<Integer>();
    boolean cast = false;

    if (outputConstraints == null || outputConstraints.size() == 0)
        // we dont have to do anything in this case
        return;


    Operator op = plan.getSinks().get(0);
    if (op instanceof CastExpression) {
        cast = true;
        op = ((CastExpression) op).getExpression();
    }

    if (!(op instanceof ProjectExpression)) {
        ableToHandle = false;
    } else {
        cols.add(Integer.valueOf(((ProjectExpression) op).getColNum()));
    }

    if (ableToHandle) {
        // we can only handle simple projections
        DataBag output = BagFactory.getInstance().newDefaultBag();
        for (Iterator<Tuple> it = outputConstraints.iterator(); it
                .hasNext();) {
            Tuple outputConstraint = it.next();
            try {
                Tuple inputConstraint = BackPropConstraint(
                        outputConstraint, cols, ((LogicalRelationalOperator)plan
                                .getPredecessors(forEach).get(0))
                                .getSchema(), cast);
                output.add(inputConstraint);
            } catch (Exception e) {
                e.printStackTrace();
                throw new FrontendException(
                        "Operator error during Augmenting Phase in Example Generator "
                                + e.getMessage());
            }
        }
        outputConstraintsMap.put(plan.getPredecessors(forEach)
                .get(0), output);
    }

}
 
Example 17
Source File: PartitionSkewedKeysTez.java    From spork with Apache License 2.0
@Override
public Map<String, Object> exec(Tuple in) throws IOException {
    if (in == null || in.size() == 0) {
        return null;
    }

    int estimatedNumReducers = -1;
    boolean estimate_sample_quantile = PigMapReduce.sJobConfInternal.get().getBoolean
            (PigProcessor.ESTIMATE_PARALLELISM, false);
    if (estimate_sample_quantile) {
        int specifiedNumReducer = (Integer) in.get(0);
        DataBag samples = (DataBag) in.get(1);

        long totalSampleSize = 0;
        long totalInputRows = 0;
        Iterator<Tuple> iter = samples.iterator();
        while (iter.hasNext()) {
            Tuple t = iter.next();
            totalInputRows += (Long)t.get(t.size() - 1);
            totalSampleSize += getMemorySize(t);
        }
        long totalSampleCount_ = samples.size();

        long estimatedInputSize = (long)((double)totalSampleSize/totalSampleCount_ * totalInputRows);

        long bytesPerTask = PigMapReduce.sJobConfInternal.get().getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER);

        estimatedNumReducers = (int)Math.ceil((double)estimatedInputSize/bytesPerTask);
        estimatedNumReducers = Math.min(estimatedNumReducers, InputSizeReducerEstimator.DEFAULT_MAX_REDUCER_COUNT_PARAM);

        LOG.info("Estimating parallelism: estimatedInputSize is " + estimatedInputSize + ". bytesPerTask is " + bytesPerTask + ". estimatedNumReducers is " + estimatedNumReducers + ".");

        this.totalReducers_ = estimatedNumReducers;
        LOG.info("Use estimated reducer instead:" + estimatedNumReducers + ", orig: " + specifiedNumReducer);
    }
    Map<String, Object> result = super.exec(in);
    if (estimate_sample_quantile) {
        result.put(PigProcessor.ESTIMATED_NUM_PARALLELISM, totalReducers_);
    }
    PigProcessor.sampleMap = result;
    return result;
}
 
Example 18
Source File: SimpleRandomSample.java    From datafu with Apache License 2.0
@Override
public DataBag exec(Tuple input) throws IOException
{
  DataBag bag = (DataBag) input.get(0);

  boolean first = true;
  double p = 0.0d; // the sampling probability
  long n = 0L; // the size of the population (total number of items)

  DataBag selected = _BAG_FACTORY.newDefaultBag();
  DataBag waiting = _BAG_FACTORY.newSortedBag(ScoredTupleComparator.getInstance());

  for (Tuple tuple : bag)
  {
    if (first)
    {
      p = (Double) tuple.get(0);
      first = false;
    }

    n += (Long) tuple.get(1);
    selected.addAll((DataBag) tuple.get(3));
    waiting.addAll((DataBag) tuple.get(4));
  }

  long numSelected = selected.size();
  long numWaiting = waiting.size();

  long s = (long) Math.ceil(p * n); // sample size

  System.out.println("To sample " + s + " items from " + n + ", we pre-selected "
      + numSelected + ", and waitlisted " + waiting.size() + ".");

  long numNeeded = s - selected.size();

  if (numNeeded < 0)
  {
    System.err.println("Pre-selected " + numSelected + " items, but only needed " + s
        + ".");
  }

  for (Tuple scored : waiting)
  {
    if (numNeeded <= 0)
    {
      break;
    }
    selected.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
    numNeeded--;
  }

  if (numNeeded > 0)
  {
    System.err.println("The waiting list only has " + numWaiting
        + " items, but needed " + numNeeded + " more.");
  }

  return selected;
}
 
Example 19
Source File: MetricEvaluation.java    From spork with Apache License 2.0
public static float getConciseness(
        Operator op,
        Map<Operator, DataBag> exampleData,
        Map<LogicalRelationalOperator, Collection<IdentityHashSet<Tuple>>> OperatorToEqClasses,
        boolean overallConciseness) {
    DataBag bag = exampleData.get(op);

    int noEqCl = OperatorToEqClasses.get(op).size();
    long noTuples = bag.size();

    float conciseness = 100 * ((float) noEqCl / (float) noTuples);
    if (!overallConciseness) {

        return ((conciseness > 100.0) ? 100.0f : conciseness);
    } else {

        noEqCl = 0;
        noTuples = 0;
        conciseness = 0;
        int noOperators = 0;

        for (Map.Entry<LogicalRelationalOperator, Collection<IdentityHashSet<Tuple>>> e : OperatorToEqClasses
                .entrySet()) {
            if (e.getKey().getAlias() == null)
                continue;
            noOperators++; // we need to keep a track of these and not use
                           // OperatorToEqClasses.size() as LORead shouldn't
                           // be considered a operator
            bag = exampleData.get(e.getKey());

            noTuples = bag.size();
            noEqCl = e.getValue().size();
            float concise = 100 * ((float) noEqCl / (float) noTuples);
            concise = (concise > 100) ? 100 : concise;
            conciseness += concise;
        }
        conciseness /= (float) noOperators;

        return conciseness;
    }

}
 
Example 20
Source File: Over.java    From spork with Apache License 2.0
public void addAll(DataBag b) {
    tuples = new ArrayList<Tuple>((int)b.size());
    for (Tuple t : b) {
        tuples.add(t);
    }
}