/*
 * Copyright 2018 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.examples.generateprofiles;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.aerospike.client.AerospikeClient;
import com.aerospike.client.Bin;
import com.aerospike.client.Key;
import com.aerospike.client.policy.WritePolicy;
import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;

/**
 * Example Hadoop job that reads web-server access logs, extracts the
 * numeric user id from the first field of every line, and writes one
 * synthetic profile record (userid, age, isMale) per distinct user id
 * to Aerospike.
 */
public class GenerateProfiles extends Configured implements Tool {

    private static final Log log =
        LogFactory.getLog(GenerateProfiles.class);

    // Sample line format:
    // 37518 - - [16/Jun/1998:02:48:36 +0000] \
    // "GET /images/hm_hola.gif HTTP/1.0" 200 2240
    private static final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)";

    private static final Pattern pat = Pattern.compile(logEntryRegex);

    // Shared map output value; the reducer never inspects it.
    private final static IntWritable one = new IntWritable(1);

    // Counter group/name used to report input lines the mapper skipped.
    private static final String COUNTER_GROUP = "GenerateProfiles";
    private static final String SKIPPED_LINES_COUNTER = "SKIPPED_LINES";

    /**
     * Emits {@code (userid, 1)} for every input line that matches the
     * access-log pattern and whose first field parses as a {@code long}.
     */
    public static class Map extends MapReduceBase implements
            Mapper<LongWritable, Text, LongWritable, IntWritable> {

        /**
         * Parses one log line and collects its user id.
         *
         * Lines that do not match the pattern, or whose first field is
         * not a valid {@code long} (the pattern also admits dots), are
         * skipped and counted rather than failing the task.
         *
         * @param key      input key supplied by the input format (unused)
         * @param rec      one line of the access log
         * @param output   collector receiving {@code (userid, 1)}
         * @param reporter used to count skipped lines; may be null
         * @throws IOException if the collector fails
         */
        @Override
        public void map(LongWritable key, Text rec,
                        OutputCollector<LongWritable, IntWritable> output,
                        Reporter reporter) throws IOException {
            String line = rec.toString();
            Matcher matcher = pat.matcher(line);
            if (!matcher.matches()) {
                countSkipped(reporter);
                return;
            }

            long userid;
            try {
                userid = Long.parseLong(matcher.group(1));
            } catch (NumberFormatException ignored) {
                // First field contained dots or overflowed a long;
                // such lines carry no usable user id, so skip them.
                countSkipped(reporter);
                return;
            }

            // Collector failures are real errors and must not be hidden.
            output.collect(new LongWritable(userid), one);
        }

        // Records one skipped input line in the job counters.
        private static void countSkipped(Reporter reporter) {
            if (reporter != null) {
                reporter.incrCounter(COUNTER_GROUP,
                                     SKIPPED_LINES_COUNTER, 1);
            }
        }
    }

    /**
     * Value type emitted by the reducer: a user id plus the synthetic
     * age and gender flag derived from it.
     */
    private static class Profile implements Writable {
        public long userid;
        public int age;
        public int isMale;

        /** No-arg constructor so the class can be created reflectively. */
        public Profile() {
        }

        public Profile(long userid, int age, int isMale) {
            this.userid = userid;
            this.age = age;
            this.isMale = isMale;
        }

        /** Reads the fields in the same order {@link #write} emits them. */
        @Override
        public void readFields(DataInput in) throws IOException {
            userid = in.readLong();
            age = in.readInt();
            isMale = in.readInt();
        }

        /** Writes userid (long), age (int) and isMale (int), in that order. */
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(userid);
            out.writeInt(age);
            out.writeInt(isMale);
        }
    }

    /**
     * Produces exactly one {@link Profile} per distinct user id; the
     * grouped values are ignored.
     */
    public static class Reduce extends MapReduceBase implements
            Reducer<LongWritable, IntWritable, LongWritable, Profile> {

        /**
         * Builds a fake profile for {@code userid} and collects it.
         *
         * @param userid   the user id key
         * @param ones     grouped map values (not consumed)
         * @param output   collector receiving {@code (userid, profile)}
         * @param reporter progress reporter (unused)
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(LongWritable userid, Iterator<IntWritable> ones,
                           OutputCollector<LongWritable, Profile> output,
                           Reporter reporter) throws IOException {
            long id = userid.get();

            // Fake age based on userid; always in the range 20..59.
            int age = nonNegativeMod(id, 40) + 20;

            // Fake gender based on userid; always 0 or 1.
            int isMale = nonNegativeMod(id, 2);

            output.collect(userid, new Profile(id, age, isMale));
        }

        // Remainder of value / modulus, taken on the long (before any
        // narrowing cast) and normalised into [0, modulus).
        private static int nonNegativeMod(long value, int modulus) {
            return (int) (((value % modulus) + modulus) % modulus);
        }
    }

    /**
     * Output format that stores each {@link Profile} as an Aerospike
     * record keyed by user id.
     */
    public static class ProfileOutputFormat
        extends AerospikeOutputFormat<LongWritable, Profile> {

        /** Record writer that puts the three profile bins per record. */
        public static class ProfileRecordWriter
            extends AerospikeRecordWriter<LongWritable, Profile> {

            /**
             * @param cfg          job configuration passed to the base writer
             * @param progressable accepted for signature compatibility; unused
             */
            public ProfileRecordWriter(Configuration cfg,
                                       Progressable progressable) {
                super(cfg);
            }

            /**
             * Writes bins {@code userid}, {@code age} and {@code isMale}
             * under a key built from namespace, set name and user id.
             */
            @Override
            public void writeAerospike(LongWritable userid,
                                       Profile profile,
                                       AerospikeClient client,
                                       WritePolicy writePolicy,
                                       String namespace,
                                       String setName) throws IOException {
                // Overrides the timeout on the supplied policy for every
                // write (value is in milliseconds per the Aerospike
                // client's policy documentation).
                writePolicy.totalTimeout = 10000;

                Key kk = new Key(namespace, setName, userid.get());
                Bin bin0 = new Bin("userid", profile.userid);
                Bin bin1 = new Bin("age", profile.age);
                Bin bin2 = new Bin("isMale", profile.isMale);
                client.put(writePolicy, kk, bin0, bin1, bin2);
            }
        }

        /** Creates the writer used for this job's reduce output. */
        public RecordWriter<LongWritable, Profile>
            getAerospikeRecordWriter(Configuration conf, Progressable prog) {
            return new ProfileRecordWriter(conf, prog);
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args input paths; each one is added as a job input path
     * @return 0 once the job has finished
     * @throws Exception if job submission or execution fails
     */
    @Override
    public int run(final String[] args) throws Exception {

        log.info("run starting");

        final Configuration conf = getConf();

        JobConf job = new JobConf(conf, GenerateProfiles.class);
        job.setJobName("AerospikeGenerateProfiles");

        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        // No combiner: Reduce changes the value format.
        job.setReducerClass(Reduce.class);
        // Must agree with Reduce and ProfileOutputFormat, which both
        // use LongWritable keys.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Profile.class);

        job.setOutputFormat(ProfileOutputFormat.class);

        for (String inputPath : args) {
            FileInputFormat.addInputPath(job, new Path(inputPath));
        }

        JobClient.runJob(job);

        log.info("finished");
        return 0;
    }

    /** Command-line entry point; exits with the status returned by {@link #run}. */
    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new GenerateProfiles(), args));
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab