org.apache.avro.mapred.Pair Java Examples

The following examples show how to use org.apache.avro.mapred.Pair. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RegressionTestLoglik.java    From ml-ease with Apache License 2.0 6 votes vote down vote up
/**
 * Collapses every partial log-likelihood record of one group into a single record.
 *
 * <p>The {@code testLoglik} and {@code count} fields of all incoming values are summed. One
 * record carrying both totals is emitted under the incoming key, labelled
 * {@code "averageTestLoglik"}. No division happens in this method: the record holds the sum
 * (narrowed to float) and the total count.
 *
 * @param key group key; reused unchanged as the key of the emitted pair
 * @param values partial records to add up
 * @param collector sink for the single aggregated pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<RegressionTestLoglikOutput> values,
                   AvroCollector<Pair<Utf8, RegressionTestLoglikOutput>> collector,
                   Reporter reporter) throws IOException
{
  double loglikTotal = 0;
  double countTotal = 0;
  for (RegressionTestLoglikOutput partial : values)
  {
    // float field widened to double so the running sum keeps double precision
    loglikTotal += partial.testLoglik;
    countTotal += partial.count;
  }

  RegressionTestLoglikOutput aggregate = new RegressionTestLoglikOutput();
  aggregate.key = "averageTestLoglik";
  aggregate.testLoglik = (float) loglikTotal;
  aggregate.count = countTotal;

  Pair<Utf8, RegressionTestLoglikOutput> result =
      new Pair<Utf8, RegressionTestLoglikOutput>(key, aggregate);
  collector.collect(result);
}
 
Example #2
Source File: RegressionAdmmTrain.java    From ml-ease with Apache License 2.0 6 votes vote down vote up
/**
 * Emits the incoming record once per entry of the container held by
 * {@code _lambdaRhoConsumer}, each time under a distinct integer key.
 *
 * <p>With {@code k} the integer parsed from {@code data.key} and {@code n} the container size,
 * copy {@code i} is keyed {@code k * n + i}. The record's own {@code key} field is overwritten
 * with the decimal form of that key before each emit, so the same {@code data} instance is
 * re-keyed and re-collected on every pass.
 *
 * @param data record to fan out; its {@code key} field must parse as an integer
 * @param collector sink for the re-keyed pairs
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 * @throws NumberFormatException if {@code data.key} is not a parsable integer
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<Integer, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException
{
  int baseKey = Integer.parseInt(data.key.toString());
  // Loop-invariant: look the size up once instead of twice per iteration.
  int fanOut = _lambdaRhoConsumer.get().size();
  for (int i = 0; i < fanOut; i++)
  {
    int newkey = baseKey * fanOut + i;
    data.key = String.valueOf(newkey);
    Pair<Integer, RegressionPrepareOutput> outPair =
        new Pair<Integer, RegressionPrepareOutput>(newkey, data);
    collector.collect(outPair);
  }
}
 
Example #3
Source File: ItemModelTestLoglik.java    From ml-ease with Apache License 2.0 6 votes vote down vote up
/**
 * Adds up the partial log-likelihood records that share one key.
 *
 * <p>Emits exactly one record per call. Its {@code key} field is the incoming key, its
 * {@code testLoglik} is the sum of all incoming {@code testLoglik} values (narrowed to float),
 * and its {@code count} is the sum of all incoming counts.
 *
 * @param key group key; stored in the output record and used as the emitted pair's key
 * @param values partial records to add up
 * @param collector sink for the single aggregated pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<RegressionTestLoglikOutput> values,
                   AvroCollector<Pair<Utf8, RegressionTestLoglikOutput>> collector,
                   Reporter reporter) throws IOException
{
  double runningLoglik = 0;
  double runningCount = 0;
  for (RegressionTestLoglikOutput part : values)
  {
    runningLoglik += part.testLoglik;
    runningCount += part.count;
  }

  RegressionTestLoglikOutput summed = new RegressionTestLoglikOutput();
  summed.key = key;
  summed.testLoglik = (float) runningLoglik;
  summed.count = runningCount;

  Pair<Utf8, RegressionTestLoglikOutput> summedPair =
      new Pair<Utf8, RegressionTestLoglikOutput>(key, summed);
  collector.collect(summedPair);
}
 
Example #4
Source File: MergeAvroMapper.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 6 votes vote down vote up
/**
 * Loads and instantiates the user's record class and indexes its declared fields.
 *
 * <p>The class name is read from the job configuration under
 * {@code MergeJob.MERGE_SQOOP_RECORD_KEY} and resolved through the thread's context class
 * loader. Every declared field of that class is registered in {@code sqoopRecordFields} under
 * its lower-cased name, mapped to a pair of (original field name, field type name).
 *
 * @param context task context supplying the job configuration
 * @throws IOException if the class name is not configured or the class cannot be found
 * @throws InterruptedException declared by the signature; presumably propagated from
 *         {@code super.setup}
 */
@Override
protected void setup(Context context) throws InterruptedException, IOException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  final String userClassName = conf.get(MergeJob.MERGE_SQOOP_RECORD_KEY);
  if (userClassName == null) {
    // Fail with a descriptive error instead of the bare NPE Class.forName(null, ...) would raise.
    throw new IOException("Cannot find the user record class: configuration key "
        + MergeJob.MERGE_SQOOP_RECORD_KEY + " is not set");
  }
  try {
    final Class<?> clazz = Class.forName(userClassName, true,
        Thread.currentThread().getContextClassLoader());
    sqoopRecordImpl = (SqoopRecord) ReflectionUtils.newInstance(clazz, conf);
    for (final Field field : clazz.getDeclaredFields()) {
      final String fieldName = field.getName();
      final String fieldTypeName = field.getType().getName();
      // Keyed by lower-cased name; the original spelling is kept inside the pair.
      sqoopRecordFields.put(fieldName.toLowerCase(), new Pair<String, String>(fieldName,
          fieldTypeName));
    }
  } catch (ClassNotFoundException e) {
    throw new IOException("Cannot find the user record class with class name "
        + userClassName, e);
  }
}
 
Example #5
Source File: PartitionIdAssigner.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Emits one {@code (lambda#key, 1)} pair for every value in {@code _lambdaSet}.
 *
 * <p>The composite key is the string form of the lambda, a {@code '#'}, and the record's
 * original key. The record's {@code key} field is also overwritten with each composite key,
 * although the record itself is not emitted here.
 *
 * @param data input record; only its {@code key} field is read (and then overwritten)
 * @param collector sink for the {@code (composite key, 1)} pairs
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, Integer>> collector,
                Reporter reporter) throws IOException
{
  final String originalKey = data.key.toString();
  for (float lambda : _lambdaSet)
  {
    final String compositeKey =
        new StringBuilder().append(lambda).append("#").append(originalKey).toString();
    data.key = compositeKey;
    collector.collect(new Pair<String, Integer>(compositeKey, 1));
  }
}
 
Example #6
Source File: PartitionPreservingSchemas.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the map output schema, building and caching it on first use.
 *
 * <p>The schema is the Avro pair schema formed from {@link #getMapOutputKeySchema()} and
 * {@link #getMapOutputValueSchema()}. This method does no locking.
 *
 * @return the cached pair schema
 */
public Schema getMapOutputSchema()
{
  if (_mapOutputSchema != null)
  {
    return _mapOutputSchema;
  }
  _mapOutputSchema = Pair.getPairSchema(getMapOutputKeySchema(), getMapOutputValueSchema());
  return _mapOutputSchema;
}
 
Example #7
Source File: PartitionCollapsingSchemas.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Lazily creates the schema of the mapper's output pairs.
 *
 * <p>On the first call the key and value schemas are combined with
 * {@code Pair.getPairSchema} and stored in {@code _mapOutputSchema}; later calls return the
 * stored instance. This method does no locking.
 *
 * @return the pair schema of (map output key schema, map output value schema)
 */
public Schema getMapOutputSchema()
{
  if (_mapOutputSchema != null)
  {
    return _mapOutputSchema;
  }
  _mapOutputSchema = Pair.getPairSchema(getMapOutputKeySchema(), getMapOutputValueSchema());
  return _mapOutputSchema;
}
 
Example #8
Source File: ItemModelTrain.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Re-emits the record unchanged, keyed by the string form of its own {@code key} field.
 *
 * @param data record to forward
 * @param collector sink for the {@code (key, record)} pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException
{
  final String recordKey = data.key.toString();
  collector.collect(new Pair<String, RegressionPrepareOutput>(recordKey, data));
}
 
Example #9
Source File: RegressionTestLoglik.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Computes the weighted logistic log-likelihood of one scored record and emits it under the
 * fixed key {@code "loglik"}.
 *
 * <p>Reads {@code response} (must be 1, 0 or -1), {@code pred} and, when present,
 * {@code weight} (default 1). A response of 1 contributes {@code -log(1 + e^-pred) * weight};
 * any other response contributes {@code -log(1 + e^pred) * weight}. The emitted record
 * carries that value (narrowed to float) and the weight as its count.
 *
 * <p>{@code log(1 + e^z)} is evaluated as {@code max(z, 0) + log1p(e^-|z|)}. This is
 * mathematically identical to {@code log1p(exp(z))} but cannot overflow; the direct form
 * returns Infinity once {@code z} exceeds roughly 709.78.
 *
 * @param data scored input record
 * @param collector sink for the {@code ("loglik", output)} pair
 * @param reporter not used by this method
 * @throws IOException if {@code response} is not 1, 0 or -1
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, RegressionTestLoglikOutput>> collector,
                Reporter reporter) throws IOException
{
  int response = Util.getIntAvro(data, "response");
  double pred = Util.getDoubleAvro(data, "pred");
  double weight = 1;
  if (data.get("weight") != null)
  {
    weight = Util.getDoubleAvro(data, "weight");
  }
  if (response != 1 && response != 0 && response != -1)
  {
    throw new IOException("response should be 1,0 or -1!");
  }

  // z is the argument of log(1 + e^z): -pred for response 1, +pred for responses 0 and -1.
  double z = (response == 1) ? -pred : pred;
  // Overflow-safe softplus; exp() only ever sees a non-positive argument.
  double softplus = Math.max(z, 0) + Math.log1p(Math.exp(-Math.abs(z)));
  double loglik = -softplus * weight;

  RegressionTestLoglikOutput output = new RegressionTestLoglikOutput();
  output.key = "loglik";
  output.testLoglik = (float) loglik;
  output.count = weight;
  collector.collect(new Pair<String, RegressionTestLoglikOutput>("loglik", output));
}
 
Example #10
Source File: PartitionIdAssigner.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Emits a single {@code (key, 1)} pair for the group, ignoring the grouped values.
 *
 * @param key group key; passed through as the key of the emitted pair
 * @param values not read by this method
 * @param collector sink for the {@code (key, 1)} pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<Integer> values,
                   AvroCollector<Pair<String, Integer>> collector,
                   Reporter reporter) throws IOException
{
  Pair<String, Integer> marker = new Pair<String, Integer>(key, 1);
  collector.collect(marker);
}
 
Example #11
Source File: PartitionIdAssigner.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Assigns the current value of {@code _partitionId} to the group and then advances it by one.
 *
 * <p>The grouped values are ignored. The counter is only incremented after the pair has been
 * handed to the collector.
 *
 * @param key group key; passed through as the key of the emitted pair
 * @param values not read by this method
 * @param collector sink for the {@code (key, partition id)} pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void reduce(Utf8 key,
                   Iterable<Integer> values,
                   AvroCollector<Pair<String, Integer>> collector,
                   Reporter reporter) throws IOException
{
  Pair<String, Integer> assignment = new Pair<String, Integer>(key, _partitionId);
  collector.collect(assignment);
  _partitionId += 1;
}
 
Example #12
Source File: ItemModelTestLoglik.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Emits one weighted logistic log-likelihood record per entry of the record's {@code pred}
 * map.
 *
 * <p>Reads {@code response} (must be 1, 0 or -1), the {@code pred} map and, when present,
 * {@code weight} (default 1). For each {@code (k, score)} entry, a response of 1 contributes
 * {@code -log(1 + e^-score) * weight}; any other response contributes
 * {@code -log(1 + e^score) * weight}. Each emitted record is keyed by {@code k} and carries
 * the weight as its count.
 *
 * <p>{@code log(1 + e^z)} is evaluated as {@code max(z, 0) + log1p(e^-|z|)}. This is
 * mathematically identical to {@code log1p(exp(z))} but cannot overflow; the direct form
 * returns Infinity once {@code z} exceeds roughly 709.78.
 *
 * @param data scored input record
 * @param collector sink for the {@code (k as String, output)} pairs
 * @param reporter not used by this method
 * @throws IOException if {@code response} is not 1, 0 or -1
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, RegressionTestLoglikOutput>> collector,
                Reporter reporter) throws IOException
{
  int response = Util.getIntAvro(data, "response");
  // The record API returns Object; the element types are taken on trust from the input schema.
  @SuppressWarnings("unchecked")
  Map<Utf8, Float> pred = (Map<Utf8, Float>) data.get("pred");
  double weight = 1;
  if (data.get("weight") != null)
  {
    weight = Util.getDoubleAvro(data, "weight");
  }
  if (response != 1 && response != 0 && response != -1)
  {
    throw new IOException("response should be 1,0 or -1!");
  }
  // Iterate entries so each score is fetched once rather than looked up again by key.
  for (Map.Entry<Utf8, Float> entry : pred.entrySet())
  {
    Utf8 k = entry.getKey();
    double score = entry.getValue();
    // z is the argument of log(1 + e^z): -score for response 1, +score otherwise.
    double z = (response == 1) ? -score : score;
    // Overflow-safe softplus; exp() only ever sees a non-positive argument.
    double softplus = Math.max(z, 0) + Math.log1p(Math.exp(-Math.abs(z)));
    double loglik = -softplus * weight;

    RegressionTestLoglikOutput output = new RegressionTestLoglikOutput();
    output.key = k;
    output.testLoglik = (float) loglik;
    output.count = weight;
    collector.collect(new Pair<String, RegressionTestLoglikOutput>(k.toString(), output));
  }
}
 
Example #13
Source File: ItemModelTestLoglik.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the job configuration for the per-item test log-likelihood job and runs it.
 *
 * <p>The configuration is created from the mapper, reducer and combiner classes, a
 * (string, {@code RegressionTestLoglikOutput}) pair schema, and the
 * {@code RegressionTestLoglikOutput} schema itself, in that argument order.
 *
 * @throws Exception whatever configuration building or job execution raises
 */
@Override
public void run() throws Exception
{
  // Not read below; the call is kept so this method performs exactly the calls it always did.
  JobConfig props = super.getJobConfig();

  // Fourth argument: pair schema of (string key, log-likelihood record).
  Schema pairSchema =
      Pair.getPairSchema(Schema.create(Type.STRING), RegressionTestLoglikOutput.SCHEMA$);
  // Fifth argument: the bare log-likelihood record schema.
  Schema recordSchema = RegressionTestLoglikOutput.SCHEMA$;

  JobConf conf = super.createJobConf(ItemModelTestLoglikMapper.class,
                                     ItemModelTestLoglikReducer.class,
                                     ItemModelTestLoglikCombiner.class,
                                     pairSchema,
                                     recordSchema);
  AvroUtils.runAvroJob(conf);
}
 
Example #14
Source File: RegressionNaiveTrain.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Emits the record once per value in {@code _lambdaSet}, keyed {@code lambda#originalKey}.
 *
 * <p>Before each emit the record's own {@code key} field is overwritten with the composite
 * key, so the same {@code data} instance is re-keyed and re-collected on every pass.
 *
 * @param data record to fan out
 * @param collector sink for the {@code (composite key, record)} pairs
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void map(RegressionPrepareOutput data,
                AvroCollector<Pair<String, RegressionPrepareOutput>> collector,
                Reporter reporter) throws IOException
{
  final String originalKey = data.key.toString();
  for (float lambda : _lambdaSet)
  {
    final String compositeKey =
        new StringBuilder().append(lambda).append("#").append(originalKey).toString();
    data.key = compositeKey;
    collector.collect(new Pair<String, RegressionPrepareOutput>(compositeKey, data));
  }
}
 
Example #15
Source File: RegressionTest.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a job configuration whose output record is the input record plus a float
 * {@code pred} field.
 *
 * <p>The output schema is a record named {@code AdmmTestOutput} containing a fresh copy of
 * every input field (name, schema and doc; no default) followed by {@code pred}. The map
 * output schema is a pair of (float, output record).
 *
 * @param mapperClass mapper to register on the job
 * @param reducerClass reducer to register on the job
 * @return the configured job
 * @throws IllegalStateException if no input schema could be determined
 * @throws IOException declared by the signature; presumably raised by the helpers called here
 * @throws URISyntaxException declared by the signature; presumably raised by the helpers
 *         called here
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  JobConf conf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema);

  // Each input field is rebuilt as a new Schema.Field rather than reused.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields())
  {
    Schema.Field copy = new Schema.Field(source.name(), source.schema(), source.doc(), null);
    outputFields.add(copy);
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  Schema outputSchema = Schema.createRecord("AdmmTestOutput",
                                            "Test output for AdmmTest",
                                            "com.linkedin.lab.regression.avro",
                                            false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(conf, outputSchema);
  Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema);
  AvroJob.setMapOutputSchema(conf, mapOutputSchema);
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);
  return conf;
}
 
Example #16
Source File: RegressionTest.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Scores one record with a linear model and emits a copy of it, extended with the score,
 * keyed by that score.
 *
 * <p>When {@code _lambda} is non-negative the model is looked up under
 * {@code String.valueOf(_lambda)}; otherwise the first model returned by iterating the
 * available models is used. Every input field is copied into a new record of
 * {@code _outputSchema} (and logged at INFO), then {@code pred} is added.
 *
 * @param data record to score
 * @param collector sink for the {@code (pred, output record)} pair
 * @param reporter not used by this method
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<Float, GenericData.Record>> collector,
                Reporter reporter) throws IOException
{
  // lambda should be -1 when negative, and then only 1 model (the best one found in train)
  // is expected to be present.
  // NOTE(review): neither lookup is checked — a missing lambda entry or an empty model set
  // would surface as an unchecked exception further down; confirm this cannot happen.
  LinearModel model = (_lambda >= 0)
      ? _modelConsumer.get().get(String.valueOf(_lambda))
      : _modelConsumer.get().values().iterator().next();

  float pred = (float) model.evalInstanceAvro(data, false, _ignoreValue);

  GenericData.Record output = new GenericData.Record(_outputSchema);
  for (Schema.Field field : data.getSchema().getFields())
  {
    String fieldName = field.name();
    Object fieldValue = data.get(fieldName);
    output.put(fieldName, fieldValue);
    // NOTE(review): logs every field of every record at INFO — confirm this volume is wanted.
    _logger.info(fieldName + ": " + fieldValue);
  }
  output.put("pred", pred);

  collector.collect(new Pair<Float, GenericData.Record>(pred, output));
}
 
Example #17
Source File: ItemModelTest.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a job configuration whose output record is the input record plus a float
 * {@code pred} field, with map output keyed by string.
 *
 * <p>The output schema is a record named {@code PerItemTestOutput} containing a fresh copy of
 * every input field (name, schema and doc; no default) followed by {@code pred}. The map
 * output schema is a pair of (string, <em>input</em> record schema).
 *
 * @param mapperClass mapper to register on the job
 * @param reducerClass reducer to register on the job
 * @return the configured job
 * @throws IllegalStateException if no input schema could be determined
 * @throws IOException declared by the signature; presumably raised by the helpers called here
 * @throws URISyntaxException declared by the signature; presumably raised by the helpers
 *         called here
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  JobConf conf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(conf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema);

  // Each input field is rebuilt as a new Schema.Field rather than reused.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field source : inputSchema.getFields())
  {
    Schema.Field copy = new Schema.Field(source.name(), source.schema(), source.doc(), null);
    outputFields.add(copy);
  }
  outputFields.add(new Schema.Field("pred", Schema.create(Type.FLOAT), "", null));

  Schema outputSchema = Schema.createRecord("PerItemTestOutput",
                                            "Test output for PerItemTest",
                                            "com.linkedin.lab.regression.avro",
                                            false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(conf, outputSchema);
  // Mappers forward the untouched input record, so the pair value uses the input schema.
  Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.STRING), inputSchema);
  AvroJob.setMapOutputSchema(conf, mapOutputSchema);
  AvroJob.setMapperClass(conf, mapperClass);
  AvroJob.setReducerClass(conf, reducerClass);
  return conf;
}
 
Example #18
Source File: ItemModelTest.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Re-emits the record unchanged, keyed by the string form of its {@code _itemKey} column.
 *
 * @param data record to forward; must have a non-null value in the {@code _itemKey} column
 * @param collector sink for the {@code (item key, record)} pair
 * @param reporter not used by this method
 * @throws IOException if the {@code _itemKey} column is absent or null
 */
@Override
public void map(GenericData.Record data,
                AvroCollector<Pair<String, GenericData.Record>> collector,
                Reporter reporter) throws IOException
{
  // Single lookup serves both the null check and the key extraction.
  Object itemKeyValue = data.get(_itemKey);
  if (itemKeyValue == null)
  {
    throw new IOException("data does not contain the column " + _itemKey);
  }
  String itemKey = itemKeyValue.toString();
  collector.collect(new Pair<String, GenericData.Record>(itemKey, data));
}
 
Example #19
Source File: MergeAvroMapper.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 5 votes vote down vote up
/**
 * Copies every field of an Avro record into the shared {@code sqoopRecordImpl} instance.
 *
 * <p>Each Avro field is matched to a user-class field by lower-cased name through
 * {@code sqoopRecordFields}. The Avro value is passed to {@code AvroUtil.fromAvro} together
 * with the field schema and the recorded Java type name, and the result is stored under the
 * user class's original field name.
 *
 * @param genericRecord Avro record to read from
 * @return the shared {@code sqoopRecordImpl}, overwritten with this record's values; the same
 *         instance is returned on every call
 * @throws IOException if an Avro field has no counterpart in the user class
 */
private SqoopRecord toSqoopRecord(GenericRecord genericRecord) throws IOException {
  Schema avroSchema = genericRecord.getSchema();
  for (Schema.Field field : avroSchema.getFields()) {
    final String avroFieldName = field.name();
    Pair<String, String> sqoopRecordField = sqoopRecordFields.get(avroFieldName.toLowerCase());
    if (null == sqoopRecordField) {
      throw new IOException("Cannot find field '" + avroFieldName + "' in fields of user class "
          + sqoopRecordImpl.getClass().getName() + ". Fields are: "
          + Arrays.deepToString(sqoopRecordFields.values().toArray()));
    }
    Object avroObject = genericRecord.get(avroFieldName);
    // Pair layout: key() = field name as declared in the user class, value() = its type name.
    Object fieldVal = AvroUtil.fromAvro(avroObject, field.schema(), sqoopRecordField.value());
    sqoopRecordImpl.setField(sqoopRecordField.key(), fieldVal);
  }
  return sqoopRecordImpl;
}
 
Example #20
Source File: ItemModelTrain.java    From ml-ease with Apache License 2.0 4 votes vote down vote up
/**
 * Configures and runs the per-key model training job, then optionally removes temporary data.
 *
 * <p>Models are written under {@code <OUTPUT_MODEL_PATH>/models}. The optional intercept
 * prior-mean map and lambda map are registered on the job only when their properties are
 * non-empty. The remaining tuning properties are copied from the job properties into the
 * Hadoop configuration, using the defaults shown below where one is given. After the job
 * finishes, {@code <OUTPUT_MODEL_PATH>/tmp-data} is deleted recursively unless
 * {@code REMOVE_TMP_DIR} is set to false.
 *
 * @throws Exception whatever configuration building, job execution or cleanup raises
 */
@Override
public void run() throws Exception
{
  JobConfig props = super.getJobConfig();
  _logger.info("Start training per-key naive logistic regression model...");

  final String modelRoot = props.getString(OUTPUT_MODEL_PATH);
  // Must be set before the job configuration is created below.
  props.put("output.path", modelRoot + "/models");

  Schema mapOutputSchema =
      Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$);
  JobConf conf = createJobConf(ItemModelTrainMapper.class,
                               ItemModelTrainReducer.class,
                               mapOutputSchema,
                               LinearModelWithVarAvro.SCHEMA$);

  // Optional side inputs: only registered when a path was supplied.
  final String priorMeanMapPath = props.getString(INTERCEPT_PRIOR_MEAN_MAP, "");
  if (!priorMeanMapPath.isEmpty())
  {
    AvroUtils.addAvroCacheFilesAndSetTheProperty(conf,
                                                 new Path(priorMeanMapPath),
                                                 INTERCEPT_PRIOR_MEAN_MAP);
  }
  final String lambdaMapPath = props.getString(LAMBDA_MAP, "");
  if (!lambdaMapPath.isEmpty())
  {
    AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(lambdaMapPath), LAMBDA_MAP);
  }

  // Scalar settings forwarded to the tasks.
  conf.setFloat(INTERCEPT_DEFAULT_PRIOR_MEAN,
                (float) props.getDouble(INTERCEPT_DEFAULT_PRIOR_MEAN, 0));
  conf.set(INTERCEPT_LAMBDAS, props.get(INTERCEPT_LAMBDAS));
  conf.set(DEFAULT_LAMBDAS, props.get(DEFAULT_LAMBDAS));
  conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
  conf.setFloat(LIBLINEAR_EPSILON, (float) props.getDouble(LIBLINEAR_EPSILON, 0.001f));
  conf.setBoolean(COMPUTE_VAR, props.getBoolean(COMPUTE_VAR, false));
  conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
  conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));

  AvroUtils.runAvroJob(conf);

  // Cleanup is on by default.
  if (props.getBoolean(REMOVE_TMP_DIR, true))
  {
    FileSystem.get(conf).delete(new Path(modelRoot + "/tmp-data"), true);
  }
}
 
Example #21
Source File: ItemModelTrain.java    From ml-ease with Apache License 2.0 4 votes vote down vote up
/**
 * Stores one key/value pair in {@code _result}.
 *
 * <p>The pair's key is stored in its string form; the pair's value is converted to a string
 * and parsed as a double.
 *
 * @param object the record to consume; must be a {@code Pair}
 * @throws ClassCastException if {@code object} is not a {@code Pair}
 * @throws NumberFormatException if the pair's value does not parse as a double
 */
@Override
public void consume(Object object)
{
  // Unbounded wildcard instead of the raw type: key and value are only used via toString().
  Pair<?, ?> record = (Pair<?, ?>) object;
  _result.put(record.key().toString(), Double.parseDouble(record.value().toString()));
}