Java Code Examples for org.apache.pig.impl.util.ObjectSerializer#deserialize()

The following examples show how to use org.apache.pig.impl.util.ObjectSerializer#deserialize(). Each example is drawn from an open-source project; the source file and license are listed above the code.
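
Before the project examples, here is a minimal, self-contained sketch of the round trip the examples rely on (illustrative only, not taken from any of the projects below): ObjectSerializer.serialize() encodes a Serializable object as a String that can be stored in a Properties object or a Hadoop Configuration, and ObjectSerializer.deserialize() restores it on the other side, returning Object so a cast is required.

import java.io.IOException;
import org.apache.pig.impl.util.ObjectSerializer;

public class ObjectSerializerRoundTrip {
    public static void main(String[] args) throws IOException {
        boolean[] requiredFields = new boolean[] { true, false, true };

        // Front end: encode the array as a String, e.g. to stash it in UDFContext properties
        String encoded = ObjectSerializer.serialize(requiredFields);

        // Back end: decode it again; deserialize() returns Object, hence the cast
        boolean[] decoded = (boolean[]) ObjectSerializer.deserialize(encoded);
        System.out.println(decoded.length); // prints 3
    }
}
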
Example 1
Source File: FixedWidthLoader.java    From spork with Apache License 2.0
@Override
public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
    // Save reader to use in getNext()
    this.reader = reader;

    splitIndex = split.getSplitIndex();

    // Get schema from front-end
    UDFContext udfc = UDFContext.getUDFContext();
    Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfContextSignature });

    String strSchema = p.getProperty(SCHEMA_SIGNATURE);
    if (strSchema == null) {
        throw new IOException("Could not find schema in UDF context");
    }
    schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));

    requiredFields = (boolean[]) ObjectSerializer.deserialize(p.getProperty(REQUIRED_FIELDS_SIGNATURE));
    if (requiredFields != null) {
        numRequiredFields = 0;
        for (int i = 0; i < requiredFields.length; i++) {
            if (requiredFields[i])
                numRequiredFields++;
        }
    }
}
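
The front-end half of this pattern is not shown above; a hedged sketch of the pushProjection() call that would store the serialized boolean[] follows. The names REQUIRED_FIELDS_SIGNATURE, udfContextSignature and fields mirror Example 1, but the body is illustrative rather than copied from FixedWidthLoader.

@Override
public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException {
    // Mark the requested column positions; 'fields' is assumed loader state describing the declared columns
    boolean[] required = new boolean[fields.length];
    for (RequiredField f : requiredFieldList.getFields()) {
        required[f.getIndex()] = true;
    }
    Properties p = UDFContext.getUDFContext()
            .getUDFProperties(this.getClass(), new String[] { udfContextSignature });
    try {
        // Serialized on the front end here, deserialized in prepareToRead() above
        p.setProperty(REQUIRED_FIELDS_SIGNATURE, ObjectSerializer.serialize(required));
    } catch (IOException e) {
        throw new FrontendException("Could not serialize required fields: " + e.getMessage());
    }
    return new RequiredFieldResponse(true);
}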
 
Example 2
Source File: HBaseStorage.java    From spork with Apache License 2.0
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    if (location.startsWith("hbase://")){
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location.substring(8));
    }else{
        job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, location);
    }

    String serializedSchema = getUDFProperties().getProperty(contextSignature + "_schema");
    if (serializedSchema != null) {
        schema_ = (ResourceSchema) ObjectSerializer.deserialize(serializedSchema);
    }

    m_conf = initializeLocalJobConfig(job);
    // Not setting a udf property and getting the hbase delegation token
    // only once like in setLocation as setStoreLocation gets different Job
    // objects for each call and the last Job passed is the one that is
    // launched. So we end up getting multiple hbase delegation tokens.
    addHBaseDelegationToken(m_conf, job);
}
 
Example 3
Source File: PhoenixHBaseStorage.java    From phoenix with BSD 3-Clause "New" or "Revised" License
/**
 * Parse the HBase table name and configure job
 */
@Override
public void setStoreLocation(String location, Job job) throws IOException {
	String prefix = "hbase://";
	if (location.startsWith(prefix)) {
		tableName = location.substring(prefix.length());
	}
	config = new PhoenixPigConfiguration(job.getConfiguration());
	config.configure(server, tableName, batchSize);

	String serializedSchema = getUDFProperties().getProperty(contextSignature + SCHEMA);
	if (serializedSchema != null) {
		schema = (ResourceSchema) ObjectSerializer.deserialize(serializedSchema);
	}
}
 
Example 4
Source File: PhoenixHBaseLoader.java    From phoenix with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
    this.reader = reader;
    final String resourceSchemaAsStr = getValueFromUDFContext(this.contextSignature,RESOURCE_SCHEMA_SIGNATURE);
    if (resourceSchemaAsStr == null) {
        throw new IOException("Could not find schema in UDF context");
    }
    schema = (ResourceSchema) ObjectSerializer.deserialize(resourceSchemaAsStr);
}
 
Example 5
Source File: TestPruneColumn.java    From spork with Apache License 2.0
@Override
public Tuple getNext() throws IOException {
    if (aliases==null) {
        aliases = (String[])ObjectSerializer.deserialize(UDFContext.getUDFContext().getUDFProperties(this.getClass()).getProperty(signature));
        Tuple t = TupleFactory.getInstance().newTuple();
        for (String s : aliases)
            t.append(s);
        return t;
    }
    return null;
}
 
Example 6
Source File: PigSchemaConverter.java    From parquet-mr with Apache License 2.0
public static RequiredFieldList deserializeRequiredFieldList(String requiredFieldString) {
  if(requiredFieldString == null) {
      return null;
  }

  try {
    return (RequiredFieldList) ObjectSerializer.deserialize(requiredFieldString);
  } catch (IOException e) {
    throw new RuntimeException("Failed to deserialize pushProjection", e);
  }
}
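
This helper is normally paired with a serializing counterpart on the front end. A sketch of that inverse is below; parquet-mr's actual serializeRequiredFieldList may differ in its exact error handling.

public static String serializeRequiredFieldList(RequiredFieldList requiredFieldList) {
  try {
    // RequiredFieldList is Serializable, so it can be passed straight to ObjectSerializer
    return ObjectSerializer.serialize(requiredFieldList);
  } catch (IOException e) {
    throw new RuntimeException("Failed to serialize pushProjection", e);
  }
}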
 
Example 7
Source File: PigBytesRawComparator.java    From spork with Apache License 2.0
public void setConf(Configuration conf) {
    try {
        mAsc = (boolean[])ObjectSerializer.deserialize(conf.get(
            "pig.sortOrder"));
    } catch (IOException ioe) {
        mLog.error("Unable to deserialize pig.sortOrder " +
            ioe.getMessage());
        throw new RuntimeException(ioe);
    }
    if (mAsc == null) {
        mAsc = new boolean[1];
        mAsc[0] = true;
    }
    ((BinInterSedes.BinInterSedesTupleRawComparator)mWrappedComp).setConf(conf);
}
 
Example 8
Source File: PigFloatRawComparator.java    From spork with Apache License 2.0
public void setConf(Configuration conf) {
    try {
        mAsc = (boolean[])ObjectSerializer.deserialize(conf.get(
            "pig.sortOrder"));
    } catch (IOException ioe) {
        mLog.error("Unable to deserialize pig.sortOrder " +
            ioe.getMessage());
        throw new RuntimeException(ioe);
    }
    if (mAsc == null) {
        mAsc = new boolean[1];
        mAsc[0] = true;
    }
}
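
Examples 7 and 8 both read their ascending/descending flags from the "pig.sortOrder" key. The writing side is a single conf.set() performed on the front end; a minimal sketch, assuming conf is the job Configuration and ascending holds one flag per sort column:

private static void storeSortOrder(Configuration conf, boolean[] ascending) throws IOException {
    // One flag per sort column; the comparators above fall back to { true } when the key is absent
    conf.set("pig.sortOrder", ObjectSerializer.serialize(ascending));
}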
 
Example 9
Source File: SchemaTuple.java    From spork with Apache License 2.0
protected static Schema staticSchemaGen(String s) {
    try {
        if (s.equals("")) {
            Log.warn("No Schema present in SchemaTuple generated class");
            return new Schema();
        }
        return (Schema) ObjectSerializer.deserialize(s);
    } catch (IOException e) {
        throw new RuntimeException("Unable to deserialize serialized Schema: " + s, e);
    }
}
 
Example 10
Source File: OrcStorage.java    From spork with Apache License 2.0
private TypeInfo getTypeInfo(String location, Job job) throws IOException {
    Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
    TypeInfo typeInfo = (TypeInfo) ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
    if (typeInfo == null) {
        typeInfo = getTypeInfoFromLocation(location, job);
    }
    if (typeInfo != null) {
        p.setProperty(signature + SchemaSignatureSuffix, ObjectSerializer.serialize(typeInfo));
    }
    return typeInfo;
}
 
Example 11
Source File: OrcStorage.java    From spork with Apache License 2.0
@Override
public void setLocation(String location, Job job) throws IOException {
    Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
    if (!UDFContext.getUDFContext().isFrontend()) {
        typeInfo = (TypeInfo)ObjectSerializer.deserialize(p.getProperty(signature + SchemaSignatureSuffix));
    } else if (typeInfo == null) {
        typeInfo = getTypeInfo(location, job);
    }
    if (typeInfo != null && oi == null) {
        oi = OrcStruct.createObjectInspector(typeInfo);
    }
    if (!UDFContext.getUDFContext().isFrontend()) {
        if (p.getProperty(signature + RequiredColumnsSuffix) != null) {
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p
                    .getProperty(signature + RequiredColumnsSuffix));
            job.getConfiguration().setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
            job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR,
                    getReqiredColumnIdString(mRequiredColumns));
            if (p.getProperty(signature + SearchArgsSuffix) != null) {
                // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
                job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
                        getReqiredColumnNamesString(getSchema(location, job), mRequiredColumns));
            }
        } else if (p.getProperty(signature + SearchArgsSuffix) != null) {
            // Bug in setSearchArgument which always expects READ_COLUMN_NAMES_CONF_STR to be set
            job.getConfiguration().set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR,
                    getReqiredColumnNamesString(getSchema(location, job)));
        }
        if (p.getProperty(signature + SearchArgsSuffix) != null) {
            job.getConfiguration().set(SARG_PUSHDOWN, p.getProperty(signature + SearchArgsSuffix));
        }

    }
    FileInputFormat.setInputPaths(job, location);
}
 
Example 12
Source File: PigInputFormat.java    From spork with Apache License 2.0
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public org.apache.hadoop.mapreduce.RecordReader<Text, Tuple> createRecordReader(
        org.apache.hadoop.mapreduce.InputSplit split,
        TaskAttemptContext context) throws IOException,
        InterruptedException {
    // We need to create a TaskAttemptContext based on the Configuration which
    // was used in the getSplits() to produce the split supplied here. For
    // this, let's find out the input of the script which produced the split
    // supplied here and then get the corresponding Configuration and setup
    // TaskAttemptContext based on it and then call the real InputFormat's
    // createRecordReader() method

    PigSplit pigSplit = (PigSplit)split;
    activeSplit = pigSplit;
    // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
    // passing total # of splits to each split so it can be retrieved
    // here and set it to the configuration object. This number is needed
    // by PoissonSampleLoader to compute the number of samples
    int n = pigSplit.getTotalSplits();
    context.getConfiguration().setInt("pig.mapsplits.count", n);
    Configuration conf = context.getConfiguration();
    PigContext.setPackageImportList((ArrayList<String>) ObjectSerializer
            .deserialize(conf.get("udf.import.list")));
    MapRedUtil.setupUDFContext(conf);
    LoadFunc loadFunc = getLoadFunc(pigSplit.getInputIndex(), conf);
    // Pass loader signature to LoadFunc and to InputFormat through
    // the conf
    passLoadSignature(loadFunc, pigSplit.getInputIndex(), conf);

    // merge entries from split specific conf into the conf we got
    PigInputFormat.mergeSplitSpecificConf(loadFunc, pigSplit, conf);

    // for backward compatibility
    PigInputFormat.sJob = conf;

    InputFormat inputFormat = loadFunc.getInputFormat();

    List<Long> inpLimitLists =
            (ArrayList<Long>)ObjectSerializer.deserialize(
                    conf.get("pig.inpLimits"));

    return new PigRecordReader(inputFormat, pigSplit, loadFunc, context, inpLimitLists.get(pigSplit.getInputIndex()));
}
 
Example 13
Source File: PigGenericMapReduce.java    From spork with Apache License 2.0
/**
 * Configures the Reduce plan, the POPackage operator
 * and the reporter thread
 */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    inIllustrator = inIllustrator(context);
    if (inIllustrator)
        pack = getPack(context);
    Configuration jConf = context.getConfiguration();
    SpillableMemoryManager.configure(ConfigurationUtil.toProperties(jConf));
    context.getConfiguration().set(PigConstants.TASK_INDEX, Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
    sJobContext = context;
    sJobConfInternal.set(context.getConfiguration());
    sJobConf = context.getConfiguration();
    try {
        PigContext.setPackageImportList((ArrayList<String>)ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext)ObjectSerializer.deserialize(jConf.get("pig.pigContext"));

        // This attempts to fetch all of the generated code from the distributed cache, and resolve it
        SchemaTupleBackend.initialize(jConf, pigContext);

        if (rp == null)
            rp = (PhysicalPlan) ObjectSerializer.deserialize(jConf
                    .get("pig.reducePlan"));
        stores = PlanHelper.getPhysicalOperators(rp, POStore.class);

        if (!inIllustrator)
            pack = (POPackage)ObjectSerializer.deserialize(jConf.get("pig.reduce.package"));
        // To be removed
        if(rp.isEmpty())
            log.debug("Reduce Plan empty!");
        else{
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            rp.explain(baos);
            log.debug(baos.toString());
        }
        pigReporter = new ProgressableReporter();
        if(!(rp.isEmpty())) {
            roots = rp.getRoots().toArray(new PhysicalOperator[1]);
            leaf = rp.getLeaves().get(0);
        }

        // Get the UDF specific context
        MapRedUtil.setupUDFContext(jConf);

    } catch (IOException ioe) {
        String msg = "Problem while configuring reduce plan.";
        throw new RuntimeException(msg, ioe);
    }

    log.info("Aliases being processed per job phase (AliasName[line,offset]): " + jConf.get("pig.alias.location"));

    Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
 
Example 14
Source File: AvroStorage.java    From spork with Apache License 2.0
/**
 * Set input location and obtain input schema.
 */
@SuppressWarnings("unchecked")
@Override
public void setLocation(String location, Job job) throws IOException {
    if (inputAvroSchema != null) {
        return;
    }

    if (!UDFContext.getUDFContext().isFrontend()) {
        Properties udfProps = getUDFProperties();
        String mergedSchema = udfProps.getProperty(AVRO_MERGED_SCHEMA_PROPERTY);
        if (mergedSchema != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap =
                    (HashMap<URI, Map<Integer, Integer>>) ObjectSerializer.deserialize(mergedSchema);
            schemaToMergedSchemaMap = new HashMap<Path, Map<Integer, Integer>>();
            for (Entry<URI, Map<Integer, Integer>> entry : mergedSchemaMap.entrySet()) {
                schemaToMergedSchemaMap.put(new Path(entry.getKey()), entry.getValue());
            }
        }
        String schema = udfProps.getProperty(AVRO_INPUT_SCHEMA_PROPERTY);
        if (schema != null) {
            try {
                inputAvroSchema = new Schema.Parser().parse(schema);
                return;
            } catch (Exception e) {
                // Cases like testMultipleSchemas2 cause exception while deserializing
                // symbols. In that case, we get it again.
                LOG.warn("Exception while trying to deserialize schema in backend. " +
                        "Will construct again. schema= " + schema, e);
            }
        }
    }

    Configuration conf = job.getConfiguration();
    Set<Path> paths = AvroStorageUtils.getPaths(location, conf, true);
    if (!paths.isEmpty()) {
        // Set top level directories in input format. Adding all files will
        // bloat configuration size
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
        // Scan all directories including sub directories for schema
        if (inputAvroSchema == null) {
            setInputAvroSchema(paths, conf);
        }
    } else {
        throw new IOException("Input path \'" + location + "\' is not found");
    }

}
 
Example 15
Source File: PigCombiner.java    From spork with Apache License 2.0
/**
 * Configures the Reduce plan, the POPackage operator
 * and the reporter thread
 */
@SuppressWarnings("unchecked")
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration jConf = context.getConfiguration();
    try {
        PigContext.setPackageImportList((ArrayList<String>)ObjectSerializer.deserialize(jConf.get("udf.import.list")));
        pigContext = (PigContext)ObjectSerializer.deserialize(jConf.get("pig.pigContext"));
        if (pigContext.getLog4jProperties()!=null)
            PropertyConfigurator.configure(pigContext.getLog4jProperties());

        cp = (PhysicalPlan) ObjectSerializer.deserialize(jConf
                .get("pig.combinePlan"));
        pack = (POPackage)ObjectSerializer.deserialize(jConf.get("pig.combine.package"));
        // To be removed
        if(cp.isEmpty())
            log.debug("Combine Plan empty!");
        else{
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            cp.explain(baos);
            log.debug(baos.toString());
        }

        keyType = ((byte[])ObjectSerializer.deserialize(jConf.get("pig.map.keytype")))[0];
        // till here

        pigReporter = new ProgressableReporter();
        if(!(cp.isEmpty())) {
            roots = cp.getRoots().toArray(new PhysicalOperator[1]);
            leaf = cp.getLeaves().get(0);
        }
    } catch (IOException ioe) {
        String msg = "Problem while configuring combiner's reduce plan.";
        throw new RuntimeException(msg, ioe);
    }

    // Avoid log spamming
    if (firstTime) {
        log.info("Aliases being processed per job phase (AliasName[line,offset]): " + jConf.get("pig.alias.location"));
        firstTime = false;
    }
}
 
Example 16
Source File: PigGenericMapBase.java    From spork with Apache License 2.0
/**
 * Configures the mapper with the map plan and the
 * reporter thread
 */
@SuppressWarnings("unchecked")
@Override
public void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);

    Configuration job = context.getConfiguration();
    SpillableMemoryManager.configure(ConfigurationUtil.toProperties(job));
    context.getConfiguration().set(PigConstants.TASK_INDEX, Integer.toString(context.getTaskAttemptID().getTaskID().getId()));
    PigMapReduce.sJobContext = context;
    PigMapReduce.sJobConfInternal.set(context.getConfiguration());
    PigMapReduce.sJobConf = context.getConfiguration();
    inIllustrator = inIllustrator(context);

    PigContext.setPackageImportList((ArrayList<String>)ObjectSerializer.deserialize(job.get("udf.import.list")));
    pigContext = (PigContext)ObjectSerializer.deserialize(job.get("pig.pigContext"));

    // This attempts to fetch all of the generated code from the distributed cache, and resolve it
    SchemaTupleBackend.initialize(job, pigContext);

    if (pigContext.getLog4jProperties()!=null)
        PropertyConfigurator.configure(pigContext.getLog4jProperties());

    if (mp == null)
        mp = (PhysicalPlan) ObjectSerializer.deserialize(
            job.get("pig.mapPlan"));
    stores = PlanHelper.getPhysicalOperators(mp, POStore.class);

    // To be removed
    if(mp.isEmpty())
        log.debug("Map Plan empty!");
    else{
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        mp.explain(baos);
        log.debug(baos.toString());
    }
    keyType = ((byte[])ObjectSerializer.deserialize(job.get("pig.map.keytype")))[0];
    // till here

    pigReporter = new ProgressableReporter();
    // Get the UDF specific context
    MapRedUtil.setupUDFContext(job);

    if(!(mp.isEmpty())) {

        PigSplit split = (PigSplit)context.getInputSplit();
        List<OperatorKey> targetOpKeys = split.getTargetOps();

        ArrayList<PhysicalOperator> targetOpsAsList = new ArrayList<PhysicalOperator>();
        for (OperatorKey targetKey : targetOpKeys) {
            targetOpsAsList.add(mp.getOperator(targetKey));
        }
        roots = targetOpsAsList.toArray(new PhysicalOperator[1]);
        leaf = mp.getLeaves().get(0);
    }

    PigStatusReporter pigStatusReporter = PigStatusReporter.getInstance();
    pigStatusReporter.setContext(new MRTaskContext(context));

    log.info("Aliases being processed per job phase (AliasName[line,offset]): " + job.get("pig.alias.location"));

    Utils.setDefaultTimeZone(PigMapReduce.sJobConfInternal.get());
}
 
Example 17
Source File: AvroStorage.java    From Cubert with Apache License 2.0
/**
 * Set input location and obtain input schema.
 */
@SuppressWarnings("unchecked")
@Override
public void setLocation(String location, Job job) throws IOException {
    if (inputAvroSchema != null) {
        return;
    }

    if (!UDFContext.getUDFContext().isFrontend()) {
        Properties udfProps = getUDFProperties();
        String mergedSchema = udfProps.getProperty(AVRO_MERGED_SCHEMA_PROPERTY);
        if (mergedSchema != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap =
                    (HashMap<URI, Map<Integer, Integer>>) ObjectSerializer.deserialize(mergedSchema);
            schemaToMergedSchemaMap = new HashMap<Path, Map<Integer, Integer>>();
            for (Entry<URI, Map<Integer, Integer>> entry : mergedSchemaMap.entrySet()) {
                schemaToMergedSchemaMap.put(new Path(entry.getKey()), entry.getValue());
            }
        }
        String schema = udfProps.getProperty(AVRO_INPUT_SCHEMA_PROPERTY);
        if (schema != null) {
            try {
                inputAvroSchema = new Schema.Parser().parse(schema);
                return;
            } catch (Exception e) {
                // Cases like testMultipleSchemas2 cause exception while deserializing
                // symbols. In that case, we get it again.
                LOG.warn("Exception while trying to deserialize schema in backend. " +
                        "Will construct again. schema= " + schema, e);
            }
        }
    }

    Configuration conf = job.getConfiguration();
    Set<Path> paths = AvroStorageUtils.getPaths(location, conf, true);
    if (!paths.isEmpty()) {
        // Set top level directories in input format. Adding all files will
        // bloat configuration size
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
        // Scan all directories including sub directories for schema
        if (inputAvroSchema == null) {
            setInputAvroSchema(paths, conf);
        }
    } else {
        throw new IOException("Input path \'" + location + "\' is not found");
    }

}
 
Example 18
Source File: CSVLoader.java    From spork with Apache License 2.0
@Override
public Tuple getNext() throws IOException {
    mProtoTuple = new ArrayList<Object>();

    boolean inField = false;
    boolean inQuotedField = false;
    boolean evenQuotesSeen = true;
    
    if (!mRequiredColumnsInitialized) {
        if (signature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[])ObjectSerializer.deserialize(p.getProperty(signature));
        }
        mRequiredColumnsInitialized = true;
    }
    try {
        if (!in.nextKeyValue()) {
            return null;
        }                                                                                           
        Text value = (Text) in.getCurrentValue();
        byte[] buf = value.getBytes();
        int len = value.getLength();
        int fieldID = 0;

        ByteBuffer fieldBuffer = ByteBuffer.allocate(len);

        for (int i = 0; i < len; i++) {
            byte b = buf[i];
            inField = true;
            if (inQuotedField) {
                if (b == DOUBLE_QUOTE) {
                    evenQuotesSeen = !evenQuotesSeen;
                    if (evenQuotesSeen) {
                        fieldBuffer.put(DOUBLE_QUOTE);
                    }
                } else
                    if (!evenQuotesSeen &&
                            (b == FIELD_DEL || b == RECORD_DEL)) {
                        inQuotedField = false;
                        inField = false;
                        readField(fieldBuffer, fieldID++);
                    } else {
                        fieldBuffer.put(b);
                    }
            } else if (b == DOUBLE_QUOTE) {
                inQuotedField = true;
                evenQuotesSeen = true;
            } else if (b == FIELD_DEL) {
                inField = false;
                readField(fieldBuffer, fieldID++); // end of the field
            } else {
                evenQuotesSeen = true;
                fieldBuffer.put(b);
            }
        }
        if (inField) readField(fieldBuffer, fieldID++);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, 
                PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t =  mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
 
Example 19
Source File: IcebergStorage.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
public <T extends Serializable> T getFromUDFContext(String key, Class<T> clazz) throws IOException {
  Properties properties = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{signature});

  return (T) ObjectSerializer.deserialize(properties.getProperty(key));
}
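
The getter above is usually paired with a setter invoked on the front end. A hedged sketch of that write path is below; the method name is an assumption rather than IcebergStorage's actual code. It uses the same signature-scoped UDF properties so getFromUDFContext() can recover the value in the task.

public void storeInUDFContext(String key, Serializable value) throws IOException {
  Properties properties = UDFContext.getUDFContext().getUDFProperties(this.getClass(), new String[]{signature});

  // Values are stored as serialized Strings and read back with deserialize() above
  properties.setProperty(key, ObjectSerializer.serialize(value));
}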
 
Example 20
Source File: IcebergPigInputFormat.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private boolean advance() throws IOException {
  if (reader != null) {
    reader.close();
  }

  if (!tasks.hasNext()) {
    return false;
  }

  FileScanTask currentTask = tasks.next();

  Schema tableSchema = (Schema) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_SCHEMA)));
  LOG.debug("[{}]: Task table schema: {}", signature, tableSchema);

  List<String> projectedFields =
      (List<String>) ObjectSerializer.deserialize(context.getConfiguration().get(scope(ICEBERG_PROJECTED_FIELDS)));
  LOG.debug("[{}]: Task projected fields: {}", signature, projectedFields);

  Schema projectedSchema = projectedFields != null ? SchemaUtil.project(tableSchema, projectedFields) : tableSchema;

  PartitionSpec spec = currentTask.asFileScanTask().spec();
  DataFile file = currentTask.file();
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());

  Set<Integer> idColumns = spec.identitySourceIds();

  // schema needed for the projection and filtering
  boolean hasJoinedPartitionColumns = !idColumns.isEmpty();

  switch (file.format()) {
    case PARQUET:
      Map<Integer, Object> partitionValueMap = Maps.newHashMap();

      if (hasJoinedPartitionColumns) {

        Schema readSchema = TypeUtil.selectNot(projectedSchema, idColumns);
        Schema projectedPartitionSchema = TypeUtil.select(projectedSchema, idColumns);

        Map<String, Integer> partitionSpecFieldIndexMap = Maps.newHashMap();
        for (int i = 0; i < spec.fields().size(); i++) {
          partitionSpecFieldIndexMap.put(spec.fields().get(i).name(), i);
        }

        for (Types.NestedField field : projectedPartitionSchema.columns()) {
          int partitionIndex = partitionSpecFieldIndexMap.get(field.name());

          Object partitionValue = file.partition().get(partitionIndex, Object.class);
          partitionValueMap.put(field.fieldId(), convertPartitionValue(field.type(), partitionValue));
        }

        reader = Parquet.read(inputFile)
            .project(readSchema)
            .split(currentTask.start(), currentTask.length())
            .filter(currentTask.residual())
            .createReaderFunc(
                fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
            .build();
      } else {
        reader = Parquet.read(inputFile)
            .project(projectedSchema)
            .split(currentTask.start(), currentTask.length())
            .filter(currentTask.residual())
            .createReaderFunc(
                fileSchema -> PigParquetReader.buildReader(fileSchema, projectedSchema, partitionValueMap))
            .build();
      }

      recordIterator = reader.iterator();

      break;
    default:
      throw new UnsupportedOperationException("Unsupported file format: " + file.format());
  }

  return true;
}