Java Code Examples for org.apache.pig.impl.logicalLayer.schema.Schema#size()

The following examples show how to use org.apache.pig.impl.logicalLayer.schema.Schema#size() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SchemaUtils.java    From Cubert with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a Pig {@link Schema} into a {@code BlockSchema}.
 *
 * One column is produced per schema field, carrying the field's alias and
 * its converted type name. A field that has a nested schema gets that
 * nested schema converted recursively and attached to its column.
 *
 * @param schema the Pig schema to translate
 * @return a block schema with one column per field of {@code schema}
 * @throws FrontendException if a field cannot be read from the schema
 */
public static BlockSchema convertToBlockSchema(Schema schema) throws FrontendException
{
    final int columnCount = schema.size();
    ColumnType[] columns = new ColumnType[columnCount];

    int index = 0;
    while (index < columnCount)
    {
        ColumnType column = new ColumnType();
        FieldSchema field = schema.getField(index);

        column.setName(field.alias);
        String pigTypeName = DataType.findTypeName(field.type);
        column.setType(convertoRCFTypeName(pigTypeName));

        Schema nested = field.schema;
        if (nested != null)
        {
            column.setColumnSchema(convertToBlockSchema(nested));
        }

        columns[index] = column;
        index++;
    }

    return new BlockSchema(columns);
}
 
Example 2
Source File: TypeCheckingExpVisitor.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Scans the fields of the input schema, in order, looking for a byte array.
 * The scan ends with "not found" as soon as a null field is encountered.
 *
 * @param func the user function expression, attached to the exception raised on failure
 * @param s - input schema
 * @return true if a BYTEARRAY field is reached before any null field, else false
 * @throws VisitorException if a field schema cannot be retrieved
 */
private boolean byteArrayFound(UserFuncExpression func, Schema s) throws VisitorException {
    int position = 0;
    while (position < s.size()) {
        FieldSchema field;
        try {
            field = s.getField(position);
        } catch (FrontendException fee) {
            int errCode = 1043;
            String msg = "Unable to retrieve field schema.";
            throw new TypeCheckerException(func, msg, errCode, PigException.INPUT, fee);
        }

        if (field == null) {
            return false;
        }
        if (field.type == DataType.BYTEARRAY) {
            return true;
        }
        position++;
    }
    return false;
}
 
Example 3
Source File: TupleFromBag.java    From datafu with Apache License 2.0 6 votes vote down vote up
/**
 * Validates the input schema and builds the output schema from the nested
 * schema of the first input field.
 *
 * The input must have two or three fields, and the second field must be an
 * INTEGER. No check is made on the third field.
 *
 * @param input the input schema
 * @return a new schema constructed from the nested schema of the first input field
 * @throws RuntimeException if the field count or the second field's type is
 *         not as expected, or wrapping the FrontendException raised while
 *         reading a field
 */
@Override
public Schema outputSchema(Schema input)
{
	try {
		if (!(input.size() == 2 || input.size() == 3))
		{
			throw new RuntimeException("Expected input to have two or three fields");
		}

		if (input.getField(1).type != DataType.INTEGER ) {
			throw new RuntimeException("Expected an INT as second input, got: "+input.getField(1).type);
		}

		return new Schema(input.getField(0).schema);
	}
	catch (FrontendException e) {
		// The exception is rethrown with its cause attached, so whoever handles
		// it has the full stack trace; printing it here as well only duplicated
		// the trace on stderr.
		throw new RuntimeException(e);
	}
}
 
Example 4
Source File: HyperLogLogPlusPlus.java    From datafu with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the input is exactly one BAG field and declares a single
 * unnamed LONG field as the output.
 *
 * @param input the input schema
 * @return a schema with one unnamed LONG field
 * @throws RuntimeException if the input does not have exactly one field, if
 *         that field is not a BAG, or wrapping the FrontendException raised
 *         while reading the field
 */
@Override
public Schema outputSchema(Schema input)
{
  try {
    if (input.size() == 1)
    {
      Schema.FieldSchema onlyField = input.getField(0);
      if (onlyField.type == DataType.BAG)
      {
        return new Schema(new Schema.FieldSchema(null, DataType.LONG));
      }
      throw new RuntimeException("Expected a BAG as input");
    }
    throw new RuntimeException("Expected input to have only a single field");
  }
  catch (FrontendException e) {
    throw new RuntimeException(e);
  }
}
 
Example 5
Source File: JsFunction.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a bag to a javascript array, one element per tuple, based on a schema.
 *
 * @param bag the bag to convert
 * @param schema the schema to use for conversion; when it consists of a single
 *        TUPLE field, that field's inner schema is used for the tuples
 * @param depth call depth used for debugging messages
 * @return the resulting javascript object
 * @throws FrontendException
 * @throws ExecException
 */
private Scriptable pigBagToJS(DataBag bag, Schema schema, int depth) throws FrontendException, ExecException {
    debugConvertPigToJS(depth, "Bag", bag, schema);

    // bags always contain a tuple, so a lone TUPLE wrapper is peeled off
    Schema tupleSchema = schema;
    if (schema.size() == 1 && schema.getField(0).type == DataType.TUPLE) {
        tupleSchema = schema.getField(0).schema;
    }

    Scriptable jsArray = jsScriptEngine.jsNewArray(bag.size());
    jsArray.setParentScope(jsScriptEngine.getScope());

    int slot = 0;
    for (Tuple element : bag) {
        Object converted = pigTupleToJS(element, tupleSchema, depth + 1);
        jsArray.put(slot, jsArray, converted);
        slot++;
    }

    debugReturn(depth, jsArray);
    return jsArray;
}
 
Example 6
Source File: PigStreamingUDF.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Splits the byte range {@code [startIndex, endIndex]} of {@code buf} at field
 * delimiters and deserializes each piece with the corresponding field of the
 * tuple schema.
 *
 * A piece is emitted only when a delimiter is detected, so the bytes after
 * the last detected delimiter are not turned into a field by this method.
 *
 * @param fs field schema whose inner schema describes the tuple's fields
 * @param buf buffer holding the serialized tuple
 * @param startIndex index of the first byte to scan (inclusive)
 * @param endIndex index of the last byte to scan (inclusive)
 * @return a tuple wrapping the deserialized field values
 * @throws IOException propagated from field lookup or nested deserialization
 */
private Tuple deserializeTuple(FieldSchema fs, byte[] buf, int startIndex, int endIndex) throws IOException {
    Schema tupleSchema = fs.schema;
    ArrayList<Object> fieldValues = new ArrayList<Object>(tupleSchema.size());

    int nesting = 0;
    int nextField = 0;
    int segmentStart = startIndex;

    int pos = startIndex;
    while (pos <= endIndex) {
        nesting = DELIMS.updateDepth(buf, nesting, pos);
        boolean atDelimiter =
                StreamingDelimiters.isDelimiter(DELIMS.getFieldDelim(), buf, pos, nesting, endIndex);
        if (atDelimiter) {
            FieldSchema fieldSchema = tupleSchema.getField(nextField);
            fieldValues.add(deserialize(fieldSchema, buf, segmentStart, pos - 1));
            // NOTE(review): the +3 presumably skips a three-byte delimiter - confirm
            segmentStart = pos + 3;
            nextField++;
        }
        pos++;
    }
    return tupleFactory.newTupleNoCopy(fieldValues);
}
 
Example 7
Source File: JsFunction.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Invokes the javascript function named by {@code functionName} on the given tuple.
 *
 * The tuple is converted to a javascript object using the input schema, the
 * function arguments are picked out of that object by field alias, and the
 * call result is converted back using {@code outputSchema}.
 *
 * @param tuple the input tuple
 * @return the first field of the converted result when {@code outputSchema}
 *         has exactly one field, otherwise the whole converted tuple
 * @throws IOException declared by the method; raised by the schema lookups and conversions
 */
@Override
public Object exec(Tuple tuple) throws IOException {
	Schema inputSchema = this.getInputSchema();
    if (LOG.isDebugEnabled()) {
        LOG.debug( "CALL " + stringify(outputSchema) + " " + functionName + " " + stringify(inputSchema));
    }
    // A UDF always takes a tuple: when the input schema is a single TUPLE field,
    // use that field's inner schema so the function sees the inner fields directly.
    if (inputSchema.size() == 1 && inputSchema.getField(0).type == DataType.TUPLE) {
        inputSchema = inputSchema.getField(0).schema;
    }

    Scriptable params = pigTupleToJS(tuple, inputSchema, 0);

    // Positional arguments, in schema order, looked up on the converted object by alias.
    Object[] passedParams = new Object[inputSchema.size()];
    for (int j = 0; j < passedParams.length; j++) {
        passedParams[j] = params.get(inputSchema.getField(j).alias, params);
    }

    Object result = jsScriptEngine.jsCall(functionName, passedParams);
    if (LOG.isDebugEnabled()) {
        LOG.debug( "call "+functionName+"("+Arrays.toString(passedParams)+") => "+toString(result));
    }

    // We wrap the result with an object in the following cases:
    //   1. Result is not an object type.
    //   2. OutputSchema is a tuple type.
    // The wrapper stores the result under the alias of the first output field.
    if (!(result instanceof NativeObject) || outputSchema.getField(0).type == DataType.TUPLE) {
        Scriptable wrapper = jsScriptEngine.jsNewObject();
        wrapper.put(outputSchema.getField(0).alias, wrapper, result);
        result = wrapper;
    }
    Tuple evalTuple = jsToPigTuple((Scriptable)result, outputSchema, 0);
    // A single-field output schema yields the bare value rather than a one-field tuple.
    Object eval = outputSchema.size() == 1 ? evalTuple.get(0) : evalTuple;
    LOG.debug(eval);
    return eval;
}
 
Example 8
Source File: TupleDiff.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Requires at least two input fields and declares a single CHARARRAY output
 * field named "tuplediff".
 *
 * @param input the input schema
 * @return a schema with one CHARARRAY field aliased "tuplediff"
 * @throws RuntimeException if the input has fewer than two fields
 */
@Override
public Schema outputSchema(Schema input) {
	final int fieldCount = input.size();
	if (fieldCount >= 2) {
		return new Schema(new FieldSchema("tuplediff", DataType.CHARARRAY));
	}
	throw new RuntimeException("Expected input to have at least 2 fields, but has " + fieldCount);
}
 
Example 9
Source File: BagToTuple.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Derives the output schema for a bag-to-tuple conversion: a single TUPLE
 * field whose inner schema repeats the fields of the tuple held by the input bag.
 *
 * The input must be exactly one BAG field whose first inner field is a TUPLE
 * with at least one field of its own.
 *
 * @param inputSchema the input schema
 * @return a schema with one TUPLE field, or null when a FrontendException is
 *         raised while reading or building the schemas
 * @throws RuntimeException if the input does not have the expected shape
 */
@Override
public Schema outputSchema(Schema inputSchema) {
	try {
		if ((inputSchema == null) || inputSchema.size() != 1) {
			throw new RuntimeException("Expecting 1 input, found " +
					((inputSchema == null) ? 0 : inputSchema.size()));
		}

		Schema.FieldSchema inputFieldSchema = inputSchema.getField(0);
		if (inputFieldSchema.type != DataType.BAG) {
			throw new RuntimeException("Expecting a bag of tuples: {()}");
		}

		// A bag without an inner schema cannot be validated; report it with the
		// same diagnostic used for the other shape errors rather than failing
		// with a bare NullPointerException on the dereference below.
		if (inputFieldSchema.schema == null) {
			throw new RuntimeException("Expecting a bag of tuples: {()}, found: " + inputSchema);
		}

		// first field in the bag schema
		Schema.FieldSchema firstFieldSchema = inputFieldSchema.schema.getField(0);
		if ((firstFieldSchema == null) || (firstFieldSchema.schema == null)
				|| firstFieldSchema.schema.size() < 1) {
			throw new RuntimeException("Expecting a bag of tuples: {()}, found: " + inputSchema);
		}

		if (firstFieldSchema.type != DataType.TUPLE) {
			throw new RuntimeException("Expecting a bag of tuples: {()}, found: " + inputSchema);
		}

		// now for output schema
		Schema tupleOutputSchema = new Schema();
		for (int i = 0; i < firstFieldSchema.schema.size(); ++i) {
			tupleOutputSchema.add(firstFieldSchema.schema.getField(i));
		}
		return new Schema(new Schema.FieldSchema(getSchemaName(this
				.getClass().getName().toLowerCase(), inputSchema), tupleOutputSchema,
				DataType.TUPLE));
	} catch (FrontendException e) {
		e.printStackTrace();
		return null;
	}
}
 
Example 10
Source File: BagToString.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the input of a bag-to-string conversion and declares a single
 * unnamed CHARARRAY output field.
 *
 * The input must have one or two fields. The first must be a BAG whose first
 * inner field is a TUPLE with at least one field; the second, when present,
 * must be a CHARARRAY.
 *
 * @param inputSchema the input schema
 * @return a schema with one unnamed CHARARRAY field, or null when a
 *         FrontendException is raised while reading the schemas
 * @throws RuntimeException if the input does not have the expected shape
 */
@Override
public Schema outputSchema(Schema inputSchema) {
	try {
		if ((inputSchema == null) || ((inputSchema.size() != 1) && (inputSchema.size() != 2))) {
			throw new RuntimeException("Expecting 2 inputs, found: " +
					((inputSchema == null) ? 0 : inputSchema.size()));
		}

		FieldSchema inputFieldSchema = inputSchema.getField(0);
		if (inputFieldSchema.type != DataType.BAG) {
			throw new RuntimeException("Expecting a bag of tuples: {()}, found data type: " +
					DataType.findTypeName(inputFieldSchema.type));
		}

		// A bag without an inner schema cannot be validated; report it with the
		// same diagnostic used for the other shape errors rather than failing
		// with a bare NullPointerException on the dereference below.
		if (inputFieldSchema.schema == null) {
			throw new RuntimeException("Expecting a bag and a delimeter, found: " + inputSchema);
		}

		// first field in the bag schema
		FieldSchema firstFieldSchema = inputFieldSchema.schema.getField(0);
		if ((firstFieldSchema == null) || (firstFieldSchema.schema == null)
				|| firstFieldSchema.schema.size() < 1) {
			throw new RuntimeException("Expecting a bag and a delimeter, found: " + inputSchema);
		}

		if (firstFieldSchema.type != DataType.TUPLE) {
			throw new RuntimeException("Expecting a bag and a delimeter, found: " + inputSchema);
		}

		// the optional second input must be a chararray
		if (inputSchema.size() == 2) {
			FieldSchema secondInputFieldSchema = inputSchema.getField(1);

			if (secondInputFieldSchema.type != DataType.CHARARRAY) {
				throw new RuntimeException("Expecting a bag and a delimeter, found: " + inputSchema);
			}
		}

		return new Schema(new Schema.FieldSchema(null, DataType.CHARARRAY));
	} catch (FrontendException e) {
		e.printStackTrace();
		return null;
	}
}
 
Example 11
Source File: JsFunction.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a javascript array to a Pig bag, turning every array entry into a tuple.
 *
 * @param array the javascript array to convert; its ids are treated as Integer indexes
 * @param schema the schema to use for conversion; when it consists of a single
 *        TUPLE field, that field's inner schema is used for the tuples
 * @param depth call depth used for debugging messages
 * @return a default bag holding the converted tuples
 * @throws FrontendException
 * @throws ExecException
 */
private DataBag jsToPigBag(Scriptable array, Schema schema, int depth) throws FrontendException, ExecException {
    debugConvertJSToPig(depth, "Bag", array, schema);

    Schema tupleSchema = schema;
    if (schema.size() == 1 && schema.getField(0).type == DataType.TUPLE) {
        tupleSchema = schema.getField(0).schema;
    }

    List<Tuple> tuples = new ArrayList<Tuple>();
    Object[] ids = array.getIds();
    for (int k = 0; k < ids.length; k++) {
        int slot = ((Integer)ids[k]).intValue();
        Scriptable element = (Scriptable)array.get(slot, null);
        tuples.add(jsToPigTuple(element, tupleSchema, depth + 1));
    }

    DataBag converted = BagFactory.getInstance().newDefaultBag(tuples);
    debugReturn(depth, converted);
    return converted;
}
 
Example 12
Source File: JsFunction.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a javascript object to a Pig tuple based on a schema.
 *
 * For each schema field, the attribute with the field's alias is read from the
 * object. BAG, TUPLE and MAP fields are converted recursively, wrapped java
 * objects are unwrapped, undefined becomes null, and anything else is stored
 * as is. A field whose alias is absent from the object is not set on the tuple.
 *
 * @param object the javascript object to convert
 * @param schema the schema to use for conversion
 * @param depth call depth used for debugging messages
 * @return a tuple with one position per schema field
 * @throws FrontendException
 * @throws ExecException
 */
private Tuple jsToPigTuple(Scriptable object, Schema schema, int depth) throws FrontendException, ExecException {
    debugConvertJSToPig(depth, "Tuple", object, schema);
    Tuple converted = TupleFactory.getInstance().newTuple(schema.size());

    for (int pos = 0; pos < schema.size(); pos++) {
        FieldSchema field = schema.getField(pos);

        if (!object.has(field.alias, jsScriptEngine.getScope())) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("X( "+field.alias+" NOT FOUND");
            }
            continue;
        }

        Object attr = object.get(field.alias, object);
        // default: the attribute is stored unchanged
        Object value = attr;
        if (field.type == DataType.BAG) {
            value = jsToPigBag((Scriptable)attr, field.schema, depth + 1);
        } else if (field.type == DataType.TUPLE) {
            value = jsToPigTuple((Scriptable)attr, field.schema, depth + 1);
        } else if (field.type == DataType.MAP) {
            value = jsToPigMap((Scriptable)attr, field.schema, depth + 1);
        } else if (attr instanceof NativeJavaObject) {
            value = ((NativeJavaObject)attr).unwrap();
        } else if (attr instanceof Undefined) {
            value = null;
        }
        converted.set(pos, value);
    }

    debugReturn(depth, converted);
    return converted;
}
 
Example 13
Source File: SimpleEvalFunc.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Override outputSchema so we can verify the input schema at pig compile time, instead of runtime.
 *
 * The number of input fields must equal the number of parameters of the
 * target method {@code m}, and each input field's type must equal the Pig
 * type found for the corresponding parameter class.
 *
 * @param inputSchema input schema
 * @return call to super.outputSchema in case schema was defined elsewhere
 * @throws IllegalArgumentException if the schema is null, the argument count
 *         or an argument type does not match, or a field cannot be read
 */
@Override
public Schema outputSchema(Schema inputSchema)
{
  if (inputSchema == null) {
    throw new IllegalArgumentException(String.format("%s: null schema passed to %s", _method_signature(), getClass().getName()));
  }

  // check correct number of arguments
  @SuppressWarnings("rawtypes")
  Class parameterTypes[] = m.getParameterTypes();
  if (inputSchema.size() != parameterTypes.length) {
    throw new IllegalArgumentException(String.format("%s: got %d arguments, expected %d.",
                                                     _method_signature(),
                                                     inputSchema.size(),
                                                     parameterTypes.length));
  }

  // check type for each argument
  for (int i=0; i < parameterTypes.length; i++) {
    try {
      byte inputType = inputSchema.getField(i).type;
      byte parameterType = DataType.findType(parameterTypes[i]);
      if (inputType != parameterType) {
        throw new IllegalArgumentException(String.format("%s: argument type mismatch [#%d]; expected %s, got %s",
                                                         _method_signature(),
                                                         i+1,
                                                         DataType.findTypeName(parameterType),
                                                         DataType.findTypeName(inputType)));
      }
    }
    catch (FrontendException fe) {
      // The format string previously had a single %s, so the schema argument
      // was silently ignored and the message ended after the colon.
      throw new IllegalArgumentException(String.format("%s: Problem with input schema: %s", _method_signature(), inputSchema), fe);
    }
  }

  // delegate to super to determine the actual outputSchema (if specified)
  return super.outputSchema(inputSchema);
}
 
Example 14
Source File: RubySchema.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves alias collisions inside a schema, modifying the Schema object in place.
 *
 * Only aliases of the form tuple_#, bag_#, map_# or val_# are renamed, as these
 * are generally names generated by Util.getSchemaFromString: when such an alias
 * has already been seen in the same schema, its numeric suffix is incremented
 * until the name is unused. Any other colliding alias is left alone, as that
 * conflict was created by the user. Nested schemas are processed recursively;
 * for a bag, the schema of the bag's first field is the one processed.
 *
 * @param s a Schema object to fix in place; null is ignored
 */
private static void fixSchemaNames(Schema s) {
    if (s == null) {
        return;
    }

    // Matches the generated-style names that are candidates for renaming
    Pattern generatedName = Pattern.compile("(bag_|tuple_|map_|val_)(\\d+)", Pattern.CASE_INSENSITIVE);
    Set<String> seen = new HashSet<String>(s.size(), 1.0f);

    for (Schema.FieldSchema field : s.getFields()) {
        if (field.alias == null) {
            continue;
        }

        Matcher matcher = generatedName.matcher(field.alias);
        if (matcher.matches() && seen.contains(field.alias)) {
            String prefix = matcher.group(1);
            int counter = Integer.parseInt(matcher.group(2));
            String candidate = prefix + counter;
            while (seen.contains(candidate)) {
                counter++;
                candidate = prefix + counter;
            }
            field.alias = candidate;
        }
        seen.add(field.alias);

        Schema nested = field.schema;
        if (nested == null) {
            continue;
        }
        if (field.type != DataType.BAG) {
            fixSchemaNames(nested);
            continue;
        }
        try {
            fixSchemaNames(nested.getField(0).schema);
        } catch (FrontendException e) {
            throw new RuntimeException("Error recursively fixing schema: " + s, e);
        }
    }
}
 
Example 15
Source File: SummaryData.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the i-th field of a schema without exposing the checked exception.
 *
 * @param schema the schema to read from; may be null
 * @param i index of the requested field
 * @return the field at index {@code i}, or null when the schema is null or
 *         {@code i} is not below the schema size
 * @throws RuntimeException wrapping the FrontendException raised by the lookup
 */
protected FieldSchema getField(Schema schema, int i) {
  try {
    if (schema != null && i < schema.size()) {
      return schema.getField(i);
    }
    return null;
  } catch (FrontendException e) {
    throw new RuntimeException(e);
  }
}
 
Example 16
Source File: UnorderedPairs.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Derives the output schema: a BAG whose tuples hold two TUPLE fields,
 * "elem1" and "elem2", each a clone of the schema of the tuple contained in
 * the input bag.
 *
 * The input must be exactly one BAG field whose first inner field is a TUPLE.
 * A violation is reported with a RuntimeException. When schema information is
 * simply missing (null input, bag without inner schema, tuple without inner
 * schema) or a checked exception occurs while reading or cloning the schemas,
 * null is returned.
 *
 * @param input the input schema
 * @return the output schema, or null when it cannot be determined
 * @throws RuntimeException if the input does not have the expected shape
 */
@Override
public Schema outputSchema(Schema input)
{
  // Missing schema information used to end up as null via the catch-all
  // handler; keep that outcome explicit.
  if (input == null)
  {
    return null;
  }

  try {
    if (input.size() != 1)
    {
      throw new RuntimeException("Expected input to have only a single field");
    }

    Schema.FieldSchema inputFieldSchema = input.getField(0);

    if (inputFieldSchema.type != DataType.BAG)
    {
      throw new RuntimeException("Expected a BAG as input");
    }

    Schema inputBagSchema = inputFieldSchema.schema;
    if (inputBagSchema == null)
    {
      return null;
    }

    Schema.FieldSchema bagTupleFieldSchema = inputBagSchema.getField(0);
    if (bagTupleFieldSchema == null)
    {
      return null;
    }

    if (bagTupleFieldSchema.type != DataType.TUPLE)
    {
      throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s",
                                               DataType.findTypeName(bagTupleFieldSchema.type)));
    }

    if (bagTupleFieldSchema.schema == null)
    {
      return null;
    }

    Schema ouputTupleSchema = new Schema();
    ouputTupleSchema.add(new Schema.FieldSchema("elem1", bagTupleFieldSchema.schema.clone(), DataType.TUPLE));
    ouputTupleSchema.add(new Schema.FieldSchema("elem2", bagTupleFieldSchema.schema.clone(), DataType.TUPLE));
    return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
                                             ouputTupleSchema,
                                             DataType.BAG));
  }
  catch (RuntimeException e) {
    // The validation errors above are meant to reach the caller; the previous
    // catch-all handler swallowed them and returned null instead.
    throw e;
  }
  catch (Exception e) {
    // Checked failures while reading or cloning schemas: schema unknown.
    return null;
  }
}
 
Example 17
Source File: UserFuncExpression.java    From spork with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the field schema of this UDF expression, computing and caching it
 * on first use.
 *
 * The input schema is assembled from the field schemas of this expression's
 * successors in the plan, handed to the UDF, and the UDF's declared output
 * schema is turned into a logical field schema. When the UDF declares no
 * output schema (null or empty), the field type is derived from the UDF's
 * return type.
 *
 * @return the cached or newly computed field schema
 * @throws FrontendException if the UDF returns a schema with more than one
 *         field, or propagated from the schema operations
 */
@Override
public LogicalSchema.LogicalFieldSchema getFieldSchema() throws FrontendException {
    // already computed: reuse
    if (fieldSchema!=null)
        return fieldSchema;

    LogicalSchema inputSchema = new LogicalSchema();
    List<Operator> succs = plan.getSuccessors(this);

    if (succs!=null) {
        for(Operator lo : succs){
            // a single successor without a field schema makes the whole input schema unknown
            if (((LogicalExpression)lo).getFieldSchema()==null) {
                inputSchema = null;
                break;
            }
            inputSchema.addField(((LogicalExpression)lo).getFieldSchema());
        }
    }

    if (lazilyInitializeInvokerFunction) {
        initializeInvokerFunction();
    }

    // Since ef only set one time, we never change its value, so we can optimize it by instantiate only once.
    // This significantly optimize the performance of frontend (PIG-1738)
    if (ef==null) {
        ef = (EvalFunc<?>) PigContext.instantiateFuncFromSpec(mFuncSpec);
    }

    ef.setUDFContextSignature(signature);
    Properties props = UDFContext.getUDFContext().getUDFProperties(ef.getClass());
    Schema translatedInputSchema = Util.translateSchema(inputSchema);
    // publish the input schema in the UDF properties under a signature-specific key
    if(translatedInputSchema != null) {
        props.put("pig.evalfunc.inputschema."+signature, translatedInputSchema);
    }
    // Store inputSchema into the UDF context
    ef.setInputSchema(translatedInputSchema);

    Schema udfSchema = ef.outputSchema(translatedInputSchema);
    if (udfSchema != null && udfSchema.size() > 1) {
        throw new FrontendException("Given UDF returns an improper Schema. Schema should only contain one field of a Tuple, Bag, or a single type. Returns: " + udfSchema);
    }

    //TODO appendability should come from a setting
    SchemaTupleFrontend.registerToGenerateIfPossible(translatedInputSchema, false, GenContext.UDF);
    SchemaTupleFrontend.registerToGenerateIfPossible(udfSchema, false, GenContext.UDF);

    if (udfSchema != null) {
        Schema.FieldSchema fs;
        if(udfSchema.size() == 0) {
            // empty schema: fall back to the UDF's return type
            fs = new Schema.FieldSchema(null, null, DataType.findType(ef.getReturnType()));
        } else if(udfSchema.size() == 1) {
            fs = new Schema.FieldSchema(udfSchema.getField(0));
        } else {
            // NOTE(review): sizes above 1 were rejected by the check earlier in this
            // method, so this branch looks unreachable unless udfSchema is modified
            // by the calls in between - confirm
            fs = new Schema.FieldSchema(null, udfSchema, DataType.TUPLE);
        }
        fieldSchema = Util.translateFieldSchema(fs);
        fieldSchema.normalize();
    } else {
        // no schema declared by the UDF: fall back to the UDF's return type
        fieldSchema = new LogicalSchema.LogicalFieldSchema(null, null, DataType.findType(ef.getReturnType()));
    }
    uidOnlyFieldSchema = fieldSchema.mergeUid(uidOnlyFieldSchema);
    return fieldSchema;
}
 
Example 18
Source File: TypeCheckingExpVisitor.java    From spork with Apache License 2.0 4 votes vote down vote up
/**
 * Computes a modified version of manhattan distance between
 * the two schemas: s1 & s2. Here the value on the same axis
 * are preferred over values that change axis as this means
 * that the number of casts required will be lesser on the same
 * axis.
 *
 * However, this function ceases to be a metric as the triangle
 * inequality does not hold.
 *
 * Each schema is an s1.size() dimensional vector.
 * The ordering for each axis is as defined by castLookup.
 * Unallowed casts are returned a dist of INFINITY.
 *
 * Bytearray fields of s1 are left out of the score. For every other field
 * that needs a cast, the position of the target type in the castLookup
 * list of the source type (plus one) is added; the sum is finally
 * multiplied by the number of casts.
 *
 * @param s1 the schema whose fields are to be cast
 * @param s2 the schema to fit against
 * @param s2Type NORMAL requires equal field counts; VARARG allows s1 to
 *        have more fields than s2, the last field of s2 being reused for the
 *        surplus positions
 * @return the computed score, or INF if either schema is null, the field
 *         counts do not fit, a field of s1 is null, or a cast is not allowed
 */
private long fitPossible(Schema s1, Schema s2, SchemaType s2Type) {
    if (s1 == null || s2 == null) {
        return INF;
    }

    List<FieldSchema> sourceFields = s1.getFields();
    List<FieldSchema> targetFields = s2.getFields();

    if ((s2Type == SchemaType.NORMAL) && (sourceFields.size() != targetFields.size())) {
        return INF;
    }
    if ((s2Type == SchemaType.VARARG) && (sourceFields.size() < targetFields.size())) {
        return INF;
    }

    long total = 0;
    int casts = 0;
    for (int pos = 0; pos < sourceFields.size(); pos++) {
        FieldSchema source = sourceFields.get(pos);
        if (source == null) {
            return INF;
        }

        // bytearray fields are not scored here; they are
        // looked at separately outside of this function
        if (source.type == DataType.BYTEARRAY) {
            continue;
        }

        // past the end of a vararg schema: keep matching against its last field
        FieldSchema target;
        if ((s2Type == SchemaType.VARARG) && pos >= s2.size()) {
            target = targetFields.get(s2.size() - 1);
        } else {
            target = targetFields.get(pos);
        }

        if (DataType.isSchemaType(source.type)
                && !FieldSchema.equals(source, target, false, true)) {
            return INF;
        }
        if (FieldSchema.equals(source, target, true, true)) {
            continue;
        }
        if (!castLookup.containsKey(source.type)) {
            return INF;
        }
        if (!(castLookup.get(source.type).contains(target.type))) {
            return INF;
        }
        total += (castLookup.get(source.type)).indexOf(target.type) + 1;
        casts++;
    }
    return total * casts;
}
 
Example 19
Source File: TypeCheckingExpVisitor.java    From spork with Apache License 2.0 4 votes vote down vote up
/***************************************************************************
 * Compare two schemas for equality for argument matching purposes. This is
 * a more relaxed form of Schema.equals wherein first the Datatypes of the
 * field schema are checked for equality. Then if a field schema in the udf
 * schema is for a complex type AND if the inner schema is NOT null, check
 * for schema equality of the inner schemas of the UDF field schema and
 * input field schema
 *
 * Two null schemas are considered equal; a null schema never equals a
 * non-null one.
 *
 * @param inputSchema the schema of the actual arguments
 * @param udfSchema the schema declared by the UDF; a tuple is added inside
 *        bags that lack one before comparing
 * @param udfSchemaType NORMAL requires both schemas to have the same number
 *        of fields; VARARG allows the input to have more fields than the udf
 *        schema, the last udf field being reused for the surplus input fields
 * @param ignoreByteArrays when true, input fields of type bytearray are
 *        skipped instead of being compared
 * @return true if FieldSchemas are equal for argument matching, false
 *         otherwise
 * @throws FrontendException
 */
public static boolean schemaEqualsForMatching(Schema inputSchema,
        Schema udfSchema, SchemaType udfSchemaType, boolean ignoreByteArrays) throws FrontendException {


    // If both of them are null, they are equal
    if ((inputSchema == null) && (udfSchema == null)) {
        return true;
    }

    // otherwise
    if (inputSchema == null) {
        return false;
    }

    if (udfSchema == null) {
        return false;
    }

    // the old udf schemas might not have tuple inside bag
    // fix that!
    udfSchema = Util.fixSchemaAddTupleInBag(udfSchema);

    // field counts must be equal (NORMAL) or the input at least as long (VARARG)
    if ((udfSchemaType == SchemaType.NORMAL) && (inputSchema.size() != udfSchema.size()))
        return false;
    if ((udfSchemaType == SchemaType.VARARG) && inputSchema.size() < udfSchema.size())
        return false;

    Iterator<FieldSchema> i = inputSchema.getFields().iterator();
    Iterator<FieldSchema> j = udfSchema.getFields().iterator();

    // walk the input fields, advancing through the udf fields in step
    FieldSchema udfFieldSchema = null;
    while (i.hasNext()) {

        FieldSchema inputFieldSchema = i.next();
        if(inputFieldSchema == null)
            return false;

        //if there's no more UDF field: take the last one which is the vararg field
        // NOTE(review): if udfSchema has no fields at all, udfFieldSchema stays
        // null and the type comparison below would fail - confirm callers never
        // pass an empty VARARG schema
        udfFieldSchema = j.hasNext() ? j.next() : udfFieldSchema;
        
        if(ignoreByteArrays && inputFieldSchema.type == DataType.BYTEARRAY) {
            continue;
        }
        
        if (inputFieldSchema.type != udfFieldSchema.type) {
            return false;
        }

        // if a field schema in the udf schema is for a complex
        // type AND if the inner schema is NOT null, check for schema
        // equality of the inner schemas of the UDF field schema and
        // input field schema. If the field schema in the udf schema is
        // for a complex type AND if the inner schema IS null it means
        // the udf is applicable for all input which has the same type
        // for that field (irrespective of inner schema)
        // if it is a bag with empty tuple, then just rely on the field type
        if (DataType.isSchemaType(udfFieldSchema.type)
                && udfFieldSchema.schema != null
                && isNotBagWithEmptyTuple(udfFieldSchema)
        ) {
            // Compare recursively using field schema
            if (!FieldSchema.equals(inputFieldSchema, udfFieldSchema,
                    false, true)) {
                //try modifying any empty tuple to type of bytearray
                // and see if that matches. Need to do this for
                // backward compatibility -
                // User might have specified tuple with a bytearray
                // and this should also match an empty tuple

                // work on a copy so the caller's input field schema is not the one converted
                FieldSchema inputFSWithBytearrayinTuple =
                    new FieldSchema(inputFieldSchema);

                convertEmptyTupleToBytearrayTuple(inputFSWithBytearrayinTuple);

                if (!FieldSchema.equals(inputFSWithBytearrayinTuple, udfFieldSchema,
                        false, true)) {
                    return false;
                }
            }
        }

    }
    return true;
}
 
Example 20
Source File: BagGroup.java    From datafu with Apache License 2.0 4 votes vote down vote up
/**
 * Validates the input and builds the output schema: a BAG whose tuples hold
 * a "group" field followed by the original bag field.
 *
 * The input must have exactly two fields, both BAGs: the bag itself and a
 * projection of it that supplies the group keys. With more than one key the
 * "group" field is a TUPLE of the keys; with a single key it takes that key's
 * type directly. As a side effect the prefixed key names are stored in
 * {@code fieldNames} and in the instance properties under
 * {@code FIELD_NAMES_PROPERTY}.
 *
 * @param input the input schema
 * @return the output schema described above
 * @throws RuntimeException if the input does not have the expected shape, or
 *         wrapping the FrontendException raised by the schema operations
 */
@Override
public Schema getOutputSchema(Schema input)
{
  try {
    if (input.size() != 2) {
      throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %d field.", input.size()));
    }
    // Expect the first field to be a bag
    FieldSchema bagFieldSchema = input.getField(0);
    if (bagFieldSchema.type != DataType.BAG) {
      throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %s as first field.", DataType.findTypeName(bagFieldSchema.type)));
    }
    // Expect the second fields to be a projection of the bag
    FieldSchema projectedBagFieldSchema = input.getField(1);
    if (projectedBagFieldSchema.type != DataType.BAG) {
      throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %s as second field.", DataType.findTypeName(projectedBagFieldSchema.type)));
    }

    String bagName = bagFieldSchema.alias;
    // handle named tuples: a bag holding a single aliased tuple contributes the tuple's alias to the prefix
    if (bagFieldSchema.schema.size() == 1) {
      FieldSchema bagTupleFieldSchema = bagFieldSchema.schema.getField(0);
      if (bagTupleFieldSchema.type == DataType.TUPLE && bagTupleFieldSchema.alias != null) {
        bagName = getPrefixedAliasName(bagName, bagTupleFieldSchema.alias);
      }
    }
    // a projected bag holding a single tuple with a schema: use that tuple's fields as the keys
    if (projectedBagFieldSchema.schema.size() == 1) {
      FieldSchema projectedBagTupleFieldSchema = projectedBagFieldSchema.schema.getField(0);
      if (projectedBagTupleFieldSchema.type == DataType.TUPLE && projectedBagTupleFieldSchema.schema != null) {
        projectedBagFieldSchema = projectedBagTupleFieldSchema;
      }
    }

    // create the output schema for the 'group'
    // store the field names for the group keys
    Schema groupTupleSchema = new Schema();
    fieldNames = new ArrayList<String>(projectedBagFieldSchema.schema.size());
    for (int i=0; i<projectedBagFieldSchema.schema.size(); i++) {
      FieldSchema fieldSchema = projectedBagFieldSchema.schema.getField(i);
      String fieldName = fieldSchema.alias;
      fieldNames.add(getPrefixedAliasName(bagName, fieldName));
      // only alias and type are carried over to the group key field
      groupTupleSchema.add(new FieldSchema(fieldSchema.alias, fieldSchema.type));
    }
    getInstanceProperties().put(FIELD_NAMES_PROPERTY, fieldNames);

    Schema outputTupleSchema = new Schema();
    if (projectedBagFieldSchema.schema.size() > 1) {
      // multiple group keys
      outputTupleSchema.add(new FieldSchema("group", groupTupleSchema, DataType.TUPLE));
    } else {
      // single group key
      outputTupleSchema.add(new FieldSchema("group", groupTupleSchema.getField(0).type));
    }
    // the original bag follows the group field
    outputTupleSchema.add(bagFieldSchema);

    return new Schema(new Schema.FieldSchema(
          getSchemaName(this.getClass().getName().toLowerCase(), input),
          outputTupleSchema,
          DataType.BAG));
  } catch (FrontendException e) {
    throw new RuntimeException(e);
  }
}