package com.mashibing.transformer.mr;

import java.io.IOException;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.MultipleColumnPrefixFilter;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import com.google.common.collect.Lists;
import com.mashibing.common.EventLogConstants;
import com.mashibing.common.GlobalConstants;
import com.mashibing.util.TimeUtil;

/**
 * Common parent class for the entry (runner) classes of all transformer-related MR jobs.
 * 
 * @author 马士兵教育
 * 
 */
public abstract class TransformerBaseRunner implements Tool {
    private static final Logger logger = Logger.getLogger(TransformerBaseRunner.class);

    protected Configuration conf;
    private String jobName;
    private Class<?> runnerClass;
    private Class<? extends TableMapper<?, ?>> mapperClass;
    private Class<? extends Reducer<?, ?, ?, ?>> reducerClass;
    private Class<? extends OutputFormat<?, ?>> outputFormatClass;
    private Class<? extends WritableComparable<?>> mapOutputKeyClass;
    private Class<? extends Writable> mapOutputValueClass;
    private Class<?> outputKeyClass;
    private Class<?> outputValueClass;
    private long startTime;
    private boolean isCallSetUpRunnerMethod = false;

    /**
     * Set the job parameters. The map output key/value classes default to the
     * final output classes, and the output format defaults to TransformerOutputFormat.
     * 
     * @param jobName
     *            job name
     * @param runnerClass
     *            runner class
     * @param mapperClass
     *            mapper class
     * @param reducerClass
     *            reducer class
     * @param outputKeyClass
     *            output key class
     * @param outputValueClass
     *            output value class
     */
    public void setupRunner(String jobName, Class<?> runnerClass, Class<? extends TableMapper<?, ?>> mapperClass,
            Class<? extends Reducer<?, ?, ?, ?>> reducerClass,
            Class<? extends WritableComparable<?>> outputKeyClass, Class<? extends Writable> outputValueClass) {
        this.setupRunner(jobName, runnerClass, mapperClass, reducerClass, outputKeyClass, outputValueClass,
                outputKeyClass, outputValueClass, TransformerOutputFormat.class);
    }

    /**
     * Set the job parameters, with an explicit output format class.
     * 
     * @param jobName
     * @param runnerClass
     * @param mapperClass
     * @param reducerClass
     * @param outputKeyClass
     * @param outputValueClass
     * @param outputFormatClass
     */
    public void setupRunner(String jobName, Class<?> runnerClass, Class<? extends TableMapper<?, ?>> mapperClass,
            Class<? extends Reducer<?, ?, ?, ?>> reducerClass,
            Class<? extends WritableComparable<?>> outputKeyClass, Class<? extends Writable> outputValueClass,
            Class<? extends OutputFormat<?, ?>> outputFormatClass) {
        this.setupRunner(jobName, runnerClass, mapperClass, reducerClass, outputKeyClass, outputValueClass,
                outputKeyClass, outputValueClass, outputFormatClass);
    }

    /**
     * Set the job parameters, with separate map output key/value classes.
     * 
     * @param jobName
     * @param runnerClass
     * @param mapperClass
     * @param reducerClass
     * @param mapOutputKeyClass
     * @param mapOutputValueClass
     * @param outputKeyClass
     * @param outputValueClass
     */
    public void setupRunner(String jobName, Class<?> runnerClass, Class<? extends TableMapper<?, ?>> mapperClass,
            Class<? extends Reducer<?, ?, ?, ?>> reducerClass,
            Class<? extends WritableComparable<?>> mapOutputKeyClass, Class<? extends Writable> mapOutputValueClass,
            Class<? extends WritableComparable<?>> outputKeyClass, Class<? extends Writable> outputValueClass) {
        this.setupRunner(jobName, runnerClass, mapperClass, reducerClass, mapOutputKeyClass, mapOutputValueClass,
                outputKeyClass, outputValueClass, TransformerOutputFormat.class);
    }
    /**
     * Set all job parameters explicitly.
     * 
     * @param jobName
     * @param runnerClass
     * @param mapperClass
     * @param reducerClass
     * @param mapOutputKeyClass
     * @param mapOutputValueClass
     * @param outputKeyClass
     * @param outputValueClass
     * @param outputFormatClass
     */
    public void setupRunner(String jobName, Class<?> runnerClass, Class<? extends TableMapper<?, ?>> mapperClass,
            Class<? extends Reducer<?, ?, ?, ?>> reducerClass,
            Class<? extends WritableComparable<?>> mapOutputKeyClass, Class<? extends Writable> mapOutputValueClass,
            Class<? extends WritableComparable<?>> outputKeyClass, Class<? extends Writable> outputValueClass,
            Class<? extends OutputFormat<?, ?>> outputFormatClass) {
        this.jobName = jobName;
        this.runnerClass = runnerClass;
        this.mapperClass = mapperClass;
        this.reducerClass = reducerClass;
        this.mapOutputKeyClass = mapOutputKeyClass;
        this.mapOutputValueClass = mapOutputValueClass;
        this.outputKeyClass = outputKeyClass;
        this.outputValueClass = outputValueClass;
        this.outputFormatClass = outputFormatClass;
        this.isCallSetUpRunnerMethod = true;
    }

    /**
     * Entry point that launches the job.
     * 
     * @throws Exception
     */
    public void startRunner(String[] args) throws Exception {
        ToolRunner.run(new Configuration(), this, args);
    }

    @Override
    public void setConf(Configuration conf) {
        // Configuration resources are evaluated in the order they were added (add):
        // values from later resources override earlier ones. One caveat: once a value
        // has been read into memory (loaded from file), adding another resource file
        // afterwards does not override it.
        // Example: a.xml contains the key/value pair fs.defaultFS=file:///;
        // b.xml contains the key/value pair fs.defaultFS=hdfs://hh:8020.
        // Order of operations: 1. add a.xml; 2. get fs.defaultFS; 3. add b.xml; 4. get fs.defaultFS.
        // Result: both 2 and 4 return file:///.

        // Add our custom configuration files.
        conf.addResource("transformer-env.xml");
        conf.addResource("query-mapping.xml");
        conf.addResource("output-collector.xml");
        // Create the HBase-aware config object (includes the hbase configuration files).
        // HBaseConfiguration.create loads all entries of the given configuration into memory.
        this.conf = HBaseConfiguration.create(conf);
        // Hook for subclass-specific configuration.
        this.configure();
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {
        if (!this.isCallSetUpRunnerMethod) {
            throw new RuntimeException("The setupRunner method must be called first to set the job parameters");
        }
        Configuration conf = this.getConf(); // fetch the configuration object
        // Process the command-line arguments.
        this.processArgs(conf, args);

        Job job = this.initJob(conf); // create the job

        // Run the job.
        this.beforeRunJob(job); // invoked before the job runs
        Throwable error = null;
        try {
            this.startTime = System.currentTimeMillis();
            return job.waitForCompletion(true) ? 0 : -1;
        } catch (Throwable e) {
            error = e;
            logger.error("Exception while running job " + this.jobName, e);
            throw new RuntimeException(e);
        } finally {
            this.afterRunJob(job, error); // invoked after the job finishes
        }
    }
    /**
     * Create the job.
     * 
     * @param conf
     * @return
     * @throws IOException
     */
    protected Job initJob(Configuration conf) throws IOException {
        Job job = Job.getInstance(conf, this.jobName);

        job.setJarByClass(this.runnerClass);
        // Local execution
        TableMapReduceUtil.initTableMapperJob(initScans(job), this.mapperClass, this.mapOutputKeyClass,
                this.mapOutputValueClass, job, false);
        // Cluster execution: local submission or packaged (jar) submission
        // TableMapReduceUtil.initTableMapperJob(initScans(job),
        // this.mapperClass, this.mapOutputKeyClass, this.mapOutputValueClass,
        // job);
        job.setReducerClass(this.reducerClass);
        job.setOutputKeyClass(this.outputKeyClass);
        job.setOutputValueClass(this.outputValueClass);
        job.setOutputFormatClass(this.outputFormatClass);
        return job;
    }

    /**
     * Invoked before the job runs.
     * 
     * @param job
     * @throws IOException
     */
    protected void beforeRunJob(Job job) throws IOException {
        // nothing
    }

    /**
     * Invoked after the job has run.
     * 
     * @param job
     *            the job object
     * @param error
     *            exception thrown while the job was running; null if none occurred.
     * @throws IOException
     */
    protected void afterRunJob(Job job, Throwable error) throws IOException {
        try {
            // end time in milliseconds
            long endTime = System.currentTimeMillis();
            logger.info("Job<" + this.jobName + "> succeeded: " + (error == null ? job.isSuccessful() : "false")
                    + "; start time: " + startTime + "; end time: " + endTime + "; duration: "
                    + (endTime - startTime) + "ms"
                    + (error == null ? "" : "; exception: " + error.getMessage()));
        } catch (Throwable e) {
            // nothing
        }
    }

    /**
     * Add the given configuration resource files to the config.
     * 
     * @param resourceFiles
     */
    protected void configure(String... resourceFiles) {
        if (this.conf == null) {
            this.conf = HBaseConfiguration.create();
        }
        // Add the given resource files.
        if (resourceFiles != null) {
            for (String resource : resourceFiles) {
                this.conf.addResource(resource);
            }
        }
    }

    /**
     * Process the command-line arguments.
     * 
     * @param conf
     * @param args
     */
    protected void processArgs(Configuration conf, String[] args) {
        String date = null;
        for (int i = 0; i < args.length; i++) {
            if ("-d".equals(args[i])) {
                if (i + 1 < args.length) {
                    date = args[++i];
                    break;
                }
            }
        }

        // The date must be given in the format yyyy-MM-dd.
        if (StringUtils.isBlank(date) || !TimeUtil.isValidateRunningDate(date)) {
            // date is missing or invalid
            date = TimeUtil.getYesterday(); // default to yesterday
        }
        conf.set(GlobalConstants.RUNNING_DATE_PARAMES, date);
    }

    /**
     * Initialize the scan list.
     * 
     * @param job
     * @return
     */
    protected List<Scan> initScans(Job job) {
        Configuration conf = job.getConfiguration();
        // Fetch the running date: yyyy-MM-dd
        String date = conf.get(GlobalConstants.RUNNING_DATE_PARAMES);
        long startDate = TimeUtil.parseString2Long(date);
        long endDate = startDate + GlobalConstants.DAY_OF_MILLISECONDS;

        Scan scan = new Scan();
        // Set the start and stop rowkeys of the hbase scan.
        scan.setStartRow(Bytes.toBytes("" + startDate));
        scan.setStopRow(Bytes.toBytes("" + endDate));
        scan.setAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME, Bytes.toBytes(EventLogConstants.HBASE_NAME_EVENT_LOGS));

        Filter filter = this.fetchHbaseFilter();
        if (filter != null) {
            scan.setFilter(filter);
        }

        // Tuning: cache settings
        scan.setBatch(500);
        scan.setCacheBlocks(true); // enable block caching
        scan.setCaching(1000); // rows returned per RPC; default is 100. A larger value speeds up the scan
                               // (fewer RPC round trips) but may cause memory problems if set too high.
        return Lists.newArrayList(scan);
    }

    /**
     * Return the filter to apply to the hbase scan. The default implementation
     * returns null, i.e. no filtering; subclasses may override.
     * 
     * @return
     */
    protected Filter fetchHbaseFilter() {
        return null;
    }
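    // A hypothetical subclass override (illustration only; the short column
    // qualifiers below are made-up placeholders, not constants from this
    // project): restrict the scan to a few event-log columns via the
    // getColumnFilter helper defined next.
    //
    // @Override
    // protected Filter fetchHbaseFilter() {
    //     return this.getColumnFilter(new String[] {
    //             "en",    // assumed qualifier for the event name column
    //             "pl" }); // assumed qualifier for the platform column
    // }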
    /**
     * Build a column-prefix filter for the given column names.
     * 
     * @param columns
     * @return
     */
    protected Filter getColumnFilter(String[] columns) {
        int length = columns.length;
        byte[][] filter = new byte[length][];
        for (int i = 0; i < length; i++) {
            filter[i] = Bytes.toBytes(columns[i]);
        }
        return new MultipleColumnPrefixFilter(filter);
    }
}
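/**
 * A minimal usage sketch (hypothetical, not part of the original project): a
 * concrete runner extends TransformerBaseRunner, wires its mapper and reducer
 * through setupRunner, then launches via startRunner. The class names, job
 * name, and stub mapper/reducer below are made-up placeholders; real runners
 * in this project would emit and aggregate their own key/value types.
 */
class ExampleStatsRunner extends TransformerBaseRunner {

    /** Placeholder mapper; a real subclass would emit dimension keys/values from the HBase rows. */
    static class ExampleMapper extends TableMapper<org.apache.hadoop.io.Text, org.apache.hadoop.io.Text> {
    }

    /** Placeholder reducer; a real subclass would aggregate the mapper output per dimension. */
    static class ExampleReducer extends
            Reducer<org.apache.hadoop.io.Text, org.apache.hadoop.io.Text, org.apache.hadoop.io.Text, org.apache.hadoop.io.Text> {
    }

    public static void main(String[] args) throws Exception {
        ExampleStatsRunner runner = new ExampleStatsRunner();
        // setupRunner must be called before startRunner; otherwise run() throws a RuntimeException.
        runner.setupRunner("example-stats", ExampleStatsRunner.class, ExampleMapper.class, ExampleReducer.class,
                org.apache.hadoop.io.Text.class, org.apache.hadoop.io.Text.class);
        // Accepts "-d yyyy-MM-dd" on the command line; defaults to yesterday when absent or invalid.
        runner.startRunner(args);
    }
}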