Java Code Examples for org.apache.pig.ResourceSchema#getFields()

The following examples show how to use org.apache.pig.ResourceSchema#getFields(). Each example is taken from an open source project; the Source File line above it identifies the file and project it comes from.
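
Before the project examples, here is a minimal self-contained sketch (not taken from any of the projects below) of the basic pattern: getFields() returns the schema's columns as a ResourceFieldSchema[], which callers typically iterate to inspect field names and types.

import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.util.Utils;

public class GetFieldsSketch {
    public static void main(String[] args) throws Exception {
        // Build a two-field schema from a schema string.
        ResourceSchema schema = new ResourceSchema(
                Utils.getSchemaFromString("name:chararray, age:int"));
        // One ResourceFieldSchema per column.
        for (ResourceFieldSchema field : schema.getFields()) {
            System.out.println(field.getName() + " -> "
                    + DataType.findTypeName(field.getType()));
        }
    }
}
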
Example 1
Source File: OrcStorage.java    From spork with Apache License 2.0
@Override
public List<String> getPredicateFields(String location, Job job) throws IOException {
    ResourceSchema schema = getSchema(location, job);
    List<String> predicateFields = new ArrayList<String>();
    for (ResourceFieldSchema field : schema.getFields()) {
        switch(field.getType()) {
        case DataType.BOOLEAN:
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
        case DataType.DATETIME:
        case DataType.CHARARRAY:
        case DataType.BIGINTEGER:
        case DataType.BIGDECIMAL:
            predicateFields.add(field.getName());
            break;
        default:
            // Skip DataType.BYTEARRAY, DataType.TUPLE, DataType.MAP and DataType.BAG
            break;
        }
    }
    return predicateFields;
}
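
Only fields of primitive types make it into the predicate list: ORC predicate pushdown evaluates comparisons against scalar values, so bytearray (whose concrete type is unknown) and the complex types tuple, map, and bag are skipped.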
 
Example 2
Source File: TestResourceSchema.java    From spork with Apache License 2.0
/**
 * Test that ResourceSchema is correctly created given a
 * pig.Schema and vice versa 
 */
@Test
public void testResourceFlatSchemaCreation() 
throws ExecException, SchemaMergeException, FrontendException {
    String[] aliases = {"f1", "f2"};
    byte[] types = {DataType.CHARARRAY, DataType.INTEGER};
    Schema origSchema = TypeCheckingTestUtil.genFlatSchema(
            aliases, types);
    ResourceSchema rsSchema = new ResourceSchema(origSchema);
    assertEquals("num fields", aliases.length, rsSchema.getFields().length);
    ResourceSchema.ResourceFieldSchema[] fields = rsSchema.getFields();
    for (int i=0; i<fields.length; i++) {
        assertEquals(fields[i].getName(), aliases[i]);
        assertEquals(fields[i].getType(), types[i]);
    }
    Schema genSchema = Schema.getPigSchema(rsSchema);
    assertTrue("generated schema equals original", 
            Schema.equals(genSchema, origSchema, true, false));
}
 
Example 3
Source File: FixedWidthStorer.java    From spork with Apache License 2.0
@Override
public void prepareToWrite(RecordWriter writer) throws IOException {
    // Store writer to use in putNext()
    this.writer = writer;

    // Get the schema string from the UDFContext object.
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[]{ udfContextSignature });
    String strSchema = p.getProperty(SCHEMA_SIGNATURE);
    if (strSchema == null) {
        throw new IOException("Could not find schema in UDF context");
    }

    schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
    fields = schema.getFields();
}
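
The schema string read back here is placed into the UDFContext on the frontend. A sketch of that counterpart, assuming the same SCHEMA_SIGNATURE key and the checkSchema() hook from StoreFuncInterface (a common Load/Store pattern, not a quote from the file):

@Override
public void checkSchema(ResourceSchema s) throws IOException {
    // Frontend side: serialize the schema so prepareToWrite() can recover it.
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[]{ udfContextSignature });
    p.setProperty(SCHEMA_SIGNATURE, s.toString());
}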
 
Example 4
Source File: PigSchema2Avro.java    From Cubert with Apache License 2.0
/**
 * Converts a Pig ResourceSchema to an Avro schema.
 */
public static Schema convert(ResourceSchema pigSchema, boolean nullable) throws IOException {
    ResourceFieldSchema[] pigFields = pigSchema.getFields();

    /* remove the pig tuple wrapper */
    if (pigFields.length == 1) {
        AvroStorageLog.details("Ignore the pig tuple wrapper.");
        return convert(pigFields[0], nullable);
    } else {
        return convertRecord(pigFields, nullable);
    }
}
 
Example 5
Source File: JsonStorage.java    From spork with Apache License 2.0
public ResourceSchema fixSchema(ResourceSchema s) {
  for (ResourceFieldSchema field : s.getFields()) {
    if (field.getType() == DataType.NULL) {
      field.setType(DataType.BYTEARRAY);
    }
  }
  return s;
}
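
A usage note: fields can surface with type DataType.NULL when no concrete type was recorded for them; resetting them to bytearray, Pig's catch-all type, lets the storer write such fields instead of failing on an unknown type.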
 
Example 6
Source File: OrcStorage.java    From spork with Apache License 2.0
private String getReqiredColumnNamesString(ResourceSchema schema) {
    StringBuilder sb = new StringBuilder();
    for (ResourceFieldSchema field : schema.getFields()) {
        sb.append(field.getName()).append(",");
    }
    // Guard against an empty schema before trimming the trailing comma.
    if (sb.length() > 0 && sb.charAt(sb.length() - 1) == ',') {
        sb.deleteCharAt(sb.length() - 1);
    }
    return sb.toString();
}
 
Example 7
Source File: OrcStorage.java    From spork with Apache License 2.0
private String getReqiredColumnNamesString(ResourceSchema schema, boolean[] requiredColumns) {
    StringBuilder sb = new StringBuilder();
    ResourceFieldSchema[] fields = schema.getFields();
    for (int i = 0; i < requiredColumns.length; i++) {
        if (requiredColumns[i]) {
            // Append the field's name, not the field itself (whose toString()
            // would include the type as well).
            sb.append(fields[i].getName()).append(",");
        }
    }
    // Guard against no required columns before trimming the trailing comma.
    if (sb.length() > 0 && sb.charAt(sb.length() - 1) == ',') {
        sb.deleteCharAt(sb.length() - 1);
    }
    return sb.toString();
}
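
The boolean[] mask here comes from column pruning: OrcStorage implements LoadPushDown, and the RequiredFieldList that Pig passes to pushProjection() is translated into one boolean per field of the full schema.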
 
Example 8
Source File: AvroStorageSchemaConversionUtilities.java    From spork with Apache License 2.0
/**
 * Translates a ResourceSchema to an Avro Schema.
 * @param rs Input schema.
 * @param recordName Record name
 * @param recordNameSpace Namespace
 * @param definedRecordNames Map of already defined record names
 * to schema objects
 * @param doubleColonsToDoubleUnderscores whether to rewrite Pig's '::'
 * alias separators as '__' in generated Avro names
 * @return the translated schema
 * @throws IOException
 */
public static Schema resourceSchemaToAvroSchema(final ResourceSchema rs,
    String recordName, final String recordNameSpace,
    final Map<String, List<Schema>> definedRecordNames,
    final Boolean doubleColonsToDoubleUnderscores) throws IOException {

  if (rs == null) {
    return null;
  }

  recordName = toAvroName(recordName, doubleColonsToDoubleUnderscores);

  List<Schema.Field> fields = new ArrayList<Schema.Field>();
  Schema newSchema = Schema.createRecord(
      recordName, null, recordNameSpace, false);
  if (rs.getFields() != null) {
    Integer i = 0;
    for (ResourceSchema.ResourceFieldSchema rfs : rs.getFields()) {
      String rfsName = toAvroName(rfs.getName(),
          doubleColonsToDoubleUnderscores);
      Schema fieldSchema = resourceFieldSchemaToAvroSchema(
          rfsName, recordNameSpace, rfs.getType(),
          rfs.getDescription().equals("autogenerated from Pig Field Schema")
            ? null : rfs.getDescription(),
          rfs.getSchema(), definedRecordNames,
          doubleColonsToDoubleUnderscores);
      fields.add(new Schema.Field((rfsName != null)
            ? rfsName : recordName + "_" + i.toString(),
          fieldSchema,
          rfs.getDescription().equals(
              "autogenerated from Pig Field Schema")
              ? null : rfs.getDescription(), null));
      i++;

    }
    newSchema.setFields(fields);
  }

  return newSchema;
}
 
Example 9
Source File: Utils.java    From spork with Apache License 2.0
/**
 * Adds a FieldSchema for the 'input source tag/path' as the first
 * field. This is called only when PigStorage is invoked with the
 * '-tagFile' or '-tagPath' option and a schema file is present to be
 * loaded.
 *
 * @param schema the schema to extend
 * @param fieldName the name of the source-tag field to prepend
 * @return the schema with the source-tag field prepended
 */
public static ResourceSchema getSchemaWithInputSourceTag(ResourceSchema schema, String fieldName) {
    ResourceFieldSchema[] fieldSchemas = schema.getFields();
    ResourceFieldSchema sourceTagSchema = new ResourceFieldSchema(new FieldSchema(fieldName, DataType.CHARARRAY));
    ResourceFieldSchema[] fieldSchemasWithSourceTag = new ResourceFieldSchema[fieldSchemas.length + 1];
    fieldSchemasWithSourceTag[0] = sourceTagSchema;
    for(int j = 0; j < fieldSchemas.length; j++) {
        fieldSchemasWithSourceTag[j + 1] = fieldSchemas[j];
    }
    return schema.setFields(fieldSchemasWithSourceTag);
}
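
A hypothetical call site (the field name below is illustrative, not quoted from PigStorage):

ResourceSchema tagged = Utils.getSchemaWithInputSourceTag(schema, "INPUT_FILE_NAME");
// tagged.getFields()[0] is now (INPUT_FILE_NAME: chararray), and the
// original fields follow, shifted one position to the right.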
 
Example 10
Source File: Schema.java    From spork with Apache License 2.0
public static Schema getPigSchema(ResourceSchema rSchema) 
throws FrontendException {
    if(rSchema == null) {
        return null;
    }
    List<FieldSchema> fsList = new ArrayList<FieldSchema>();
    for(ResourceFieldSchema rfs : rSchema.getFields()) {
        FieldSchema fs = new FieldSchema(rfs.getName(), 
                rfs.getSchema() == null ? 
                        null : getPigSchema(rfs.getSchema()), rfs.getType());
        
        if(rfs.getType() == DataType.BAG) {
            if (fs.schema != null) { // allow partial schema
                if (fs.schema.size() == 1) {
                    FieldSchema innerFs = fs.schema.getField(0);
                    if (innerFs.type != DataType.TUPLE) {
                        ResourceFieldSchema.throwInvalidSchemaException();
                    }
                } else {
                    ResourceFieldSchema.throwInvalidSchemaException();
                }
            } 
        }
        fsList.add(fs);
    }
    return new Schema(fsList);
}
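
The bag branch above enforces Pig's invariant that a bag's inner schema is a single tuple. A round-trip sketch, assuming Utils.getSchemaFromString to build the input:

// Valid: a bag wrapping exactly one tuple survives the round trip.
Schema withBag = Utils.getSchemaFromString("b:bag{t:tuple(x:int)}");
Schema roundTripped = Schema.getPigSchema(new ResourceSchema(withBag));
// A bag whose inner schema is not a single tuple would instead hit
// ResourceFieldSchema.throwInvalidSchemaException() above.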
 
Example 11
Source File: FixedWidthLoader.java    From spork with Apache License 2.0
public FixedWidthLoader(String columnSpec, String skipHeaderStr, String schemaStr) {
    try {
        columns = parseColumnSpec(columnSpec);
        schemaStr = schemaStr.replaceAll("[\\s\\r\\n]", "");
        schema = new ResourceSchema(Utils.getSchemaFromString(schemaStr));
        fields = schema.getFields();

        for (int i = 0; i < fields.length; i++) {
            byte fieldType = fields[i].getType();
            if (fieldType == DataType.MAP || fieldType == DataType.TUPLE || fieldType == DataType.BAG) {
                throw new IllegalArgumentException(
                    "Field \"" + fields[i].getName() + "\" is an object type (map, tuple, or bag). " + 
                    "Object types are not supported by FixedWidthLoader."
                );
            }
        }

        if (fields.length < columns.size())
            warn("More columns specified in column spec than fields specified in schema. Only loading fields specified in schema.",
                 PigWarning.UDF_WARNING_2);
        else if (fields.length > columns.size())
            throw new IllegalArgumentException("More fields specified in schema than columns specified in column spec.");
    } catch (ParserException e) {
        throw new IllegalArgumentException("Invalid schema format: " + e.getMessage());
    }

    if (skipHeaderStr.equalsIgnoreCase("SKIP_HEADER"))
        skipHeader = true;
}
 
Example 12
Source File: FixedWidthLoader.java    From spork with Apache License 2.0
public FixedWidthLoader(String columnSpec) {
    try {
        columns = parseColumnSpec(columnSpec);
        String schemaStr = generateDefaultSchemaString();
        schema = new ResourceSchema(Utils.getSchemaFromString(schemaStr));
        fields = schema.getFields();
    } catch (ParserException e) {
        throw new IllegalArgumentException("Invalid schema format: " + e.getMessage());
    }
}
 
Example 13
Source File: PigBytesConverter.java    From elasticsearch-hadoop with Apache License 2.0
@Override
public void convert(Object from, BytesArray to) {

    // expect PigTuple holding a Tuple with only one field - chararray or bytearray
    Assert.isTrue(from instanceof PigTuple,
            String.format("Unexpected object type, expecting [%s], given [%s]", PigTuple.class, from.getClass()));

    PigTuple pt = (PigTuple) from;
    ResourceFieldSchema schema = pt.getSchema();

    // unwrap the tuple
    ResourceSchema tupleSchema = schema.getSchema();

    // empty tuple shortcut
    if (tupleSchema == null) {
        // write empty doc
        to.bytes("{}");
        return;
    }

    ResourceFieldSchema[] fields = tupleSchema.getFields();
    Assert.isTrue(fields.length == 1, "When using JSON input, only one field is expected");

    Object object;
    byte type;

    try {
        object = pt.getTuple().get(0);
        type = pt.getTuple().getType(0);
    } catch (Exception ex) {
        throw new EsHadoopIllegalStateException("Encountered exception while processing tuple", ex);
    }


    if (type == DataType.BIGCHARARRAY || type == DataType.CHARARRAY) {
        to.bytes(object.toString());
        return;
    }
    if (type == DataType.BYTEARRAY) {
        DataByteArray dba = (DataByteArray) object;
        to.bytes(dba.get(), dba.size());
        return;
    }

    throw new EsHadoopIllegalArgumentException(String.format("Cannot handle Pig type [%s]; expecting [%s,%s]", object.getClass(), String.class, DataByteArray.class));
}
 
Example 14
Source File: AvroStorage.java    From spork with Apache License 2.0
/**
 * Get avro schema from "location" and return the converted
 * PigSchema.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {

    /* get avro schema */
    AvroStorageLog.funcCall("getSchema");
    if (inputAvroSchema == null) {
        Configuration conf = job.getConfiguration();
        // If within a script, you store to one location and read from same
        // location using AvroStorage getPaths will be empty. Since
        // getSchema is called during script parsing we don't want to fail
        // here if path not found

        Set<Path> paths = AvroStorageUtils.getPaths(location, conf, false);
        if (!paths.isEmpty()) {
            setInputAvroSchema(paths, conf);
        }
    }
    if (inputAvroSchema != null) {
        AvroStorageLog.details("avro input schema:" + inputAvroSchema);

        /* convert to pig schema */
        ResourceSchema pigSchema = AvroSchema2Pig.convert(inputAvroSchema);
        AvroStorageLog.details("pig input schema:" + pigSchema);
        if (pigSchema.getFields().length == 1) {
            pigSchema = pigSchema.getFields()[0].getSchema();
        }
        Properties udfProps = getUDFProperties();
        udfProps.put(AVRO_INPUT_SCHEMA_PROPERTY, inputAvroSchema.toString());
        udfProps.put(AVRO_INPUT_PIG_SCHEMA_PROPERTY, pigSchema);
        if (schemaToMergedSchemaMap != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap = new HashMap<URI, Map<Integer, Integer>>();
            for (Entry<Path, Map<Integer, Integer>> entry : schemaToMergedSchemaMap.entrySet()) {
                //Path is not serializable
                mergedSchemaMap.put(entry.getKey().toUri(), entry.getValue());
            }
            udfProps.put(AVRO_MERGED_SCHEMA_PROPERTY,
                    ObjectSerializer.serialize(mergedSchemaMap));
        }

        return pigSchema;
    } else {
        return null;
    }
}
 
Example 15
Source File: AllLoader.java    From spork with Apache License 2.0
@Override
public ResourceSchema getSchema(String location, Job job)
        throws IOException {

    if (schema == null) {
        ResourceSchema foundSchema = jsonMetadata.getSchema(location, job);

        // determine schema from files in location
        if (foundSchema == null) {
            foundSchema = getSchemaFromLoadFunc(location, job);

        }

        // only add the partition keys if the schema is not null
        // we use the partitionKeySet to only set partition keys once.
        if (!(partitionKeysSet || foundSchema == null)) {
            String[] keys = getPartitionColumns(location, job);

            if (!(keys == null || keys.length == 0)) {

                // re-edit the pigSchema to contain the new partition keys.
                ResourceFieldSchema[] fields = foundSchema.getFields();

                LOG.debug("Schema: " + Arrays.toString(fields));

                ResourceFieldSchema[] newFields = Arrays.copyOf(fields,
                        fields.length + keys.length);

                int index = fields.length;

                for (String key : keys) {
                    newFields[index++] = new ResourceFieldSchema(
                            new FieldSchema(key, DataType.CHARARRAY));
                }

                foundSchema.setFields(newFields);

                LOG.debug("Added partition fields: " + keys
                        + " to loader schema");
                LOG.debug("Schema is: " + Arrays.toString(newFields));
            }

            partitionKeysSet = true;

        }

        schema = foundSchema;
    }

    return schema;
}
 