/** * Copyright 2014 LinkedIn Corp. All rights reserved. * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain a * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package com.linkedin.mlease.regression.jobs; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.avro.generic.GenericData; import org.apache.avro.mapred.AvroCollector; import org.apache.avro.mapred.AvroMapper; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; import org.apache.log4j.Logger; import com.linkedin.mlease.regression.avro.RegressionPrepareOutput; import com.linkedin.mlease.regression.avro.feature; import com.linkedin.mlease.utils.Util; import com.linkedin.mapred.AbstractAvroJob; import com.linkedin.mapred.AvroUtils; import com.linkedin.mapred.JobConfig; /** * The preparation job for Regression, must run before running RegressionAdmmTrain or RegressionNaiveTrain etc. * */ public class RegressionPrepare extends AbstractAvroJob { public static final Logger _logger = Logger.getLogger(RegressionPrepare.class); public static final String MAP_KEY = "map.key"; public static final String NUM_BLOCKS = "num.blocks"; public static final String IGNORE_FEATURE_VALUE = "binary.feature"; public static final String NUM_CLICK_REPLICATES = "num.click.replicates"; public RegressionPrepare(String jobId, JobConfig config) { super(jobId, config); } public RegressionPrepare(JobConfig config) { super(config); } @Override public void run() throws Exception { JobConfig config = super.getJobConfig(); JobConf conf = super.createJobConf(RegressionPrepareMapper.class, RegressionPrepareOutput.SCHEMA$); String mapKey = config.getString(MAP_KEY, ""); conf.set(MAP_KEY, mapKey); conf.setInt(NUM_CLICK_REPLICATES, config.getInt(NUM_CLICK_REPLICATES, 1)); conf.setBoolean(IGNORE_FEATURE_VALUE, config.getBoolean(IGNORE_FEATURE_VALUE, false)); int nblocks = config.getInt(NUM_BLOCKS, 0); conf.setInt(NUM_BLOCKS, nblocks); _logger.info("Running the preparation job of admm with map.key = " + mapKey + " and num.blocks=" + nblocks); AvroUtils.runAvroJob(conf); } public static class RegressionPrepareMapper extends AvroMapper<GenericData.Record, RegressionPrepareOutput> { String _mapkey; int _nblocks; int _numClickReplicates; boolean _ignoreValue; @Override public void setConf(Configuration conf) { super.setConf(conf); if (conf == null) { return; } _mapkey = conf.get(MAP_KEY, ""); _nblocks = conf.getInt(NUM_BLOCKS, 0); _logger.info("nblocks=" + _nblocks); _ignoreValue = conf.getBoolean(IGNORE_FEATURE_VALUE, false); _numClickReplicates = conf.getInt(NUM_CLICK_REPLICATES, 1); } @Override public void map(GenericData.Record data, AvroCollector<RegressionPrepareOutput> collector, Reporter reporter) throws IOException { String mapkey = ""; if (!_mapkey.equals("")) { if (data.get(_mapkey) == null) { throw new IOException("map.key is wrongly specified! No such key exists in some lines of the data!"); } mapkey = data.get(_mapkey).toString(); } else { // if not specified, generate the key by a random number mapkey = String.valueOf((int) Math.floor(Math.random() * _nblocks)); } RegressionPrepareOutput outData = new RegressionPrepareOutput(); outData.key = mapkey; // handle response int response = Util.getResponseAvro(data); outData.response = response; List<feature> newfeatures = new ArrayList<feature>(); // Make sure format in feature is correct Object temp = data.get("features"); if (temp == null) { throw new IOException("features is null"); } if (!(temp instanceof List)) { throw new IOException("features is not a list"); } List<?> features = (List<?>) temp; int m = features.size(); for (int i = 0; i < m; i++) { temp = features.get(i); if (!(temp instanceof GenericData.Record)) { throw new IOException("features[" + i + "] is not a record"); } GenericData.Record featureRecord = (GenericData.Record) temp; String name = Util.getStringAvro(featureRecord, "name", false); String term = Util.getStringAvro(featureRecord, "term", true); float Value = 1f; if (!_ignoreValue) { Value = (float) Util.getDoubleAvro(featureRecord, "value"); } feature newfeature = new feature(); newfeature.name = name; newfeature.term = term; newfeature.value = Value; newfeatures.add(newfeature); } outData.features = newfeatures; double weight = 1.0; if (data.get("weight") != null) { weight = Util.getDoubleAvro(data, "weight"); } if (Util.getIntAvro(data, "response") == 1) { weight = weight / _numClickReplicates; } outData.weight = (float) weight; double offset = 0.0; if (data.get("offset") != null) { offset = Util.getDoubleAvro(data, "offset"); } outData.offset = (float) offset; if (_mapkey.equals("") && response == 1) { // generate click replicates to get better consensus int partitionId = Integer.parseInt(mapkey); for (int i = 0; i < _numClickReplicates; i++) { if (partitionId >= _nblocks) { partitionId = partitionId - _nblocks; } outData.key = String.valueOf(partitionId); collector.collect(outData); partitionId++; } } else { collector.collect(outData); } } } }