package com.easy.hive.reader;

import com.easy.hive.conf.Conf;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Created by qindongliang on 2016/3/14.
 */
public class SolrReader implements RecordReader<LongWritable, MapWritable> {

    private static final Logger log = LoggerFactory.getLogger(SolrReader.class);

    protected SolrClient sc;             // Solr client
    protected SolrQuery sq;              // query object
    protected String cursorMark = null;  // deep-paging cursor
    protected QueryResponse response;    // last query response
    protected SolrDocumentList docs;     // current batch of documents
    private int hits;                    // total number of matching documents
    int pos;                             // number of documents emitted so far
    private String[] cols;               // all column names from the Hive table metadata
    private JobConf conf;                // JobConf of the MR task
    private int solrBatchSize;           // batch size for each cursor fetch
    private String solr_pk;              // Solr unique-key field
    private String solr_query;           // Solr query string
    int current;                         // index into the current batch

    public SolrReader(SolrClient sc, SolrQuery sq, String cursorMark, JobConf jobConf) {
        this.conf = jobConf;
        this.sc = sc;
        this.sq = sq;
        this.cursorMark = cursorMark;
        cols = Conf.getAllClos(jobConf.get(Conf.COLUMNS));
        solrBatchSize = Integer.parseInt(conf.get(Conf.SOLR_CURSOR_BATCH_SIZE));
        solr_pk = conf.get(Conf.SOLR_PK);
        solr_query = conf.get(Conf.SOLR_QUERY);
        // Initialize the SolrQuery and run the first query eagerly, so that
        // next() always has a batch to iterate over.
        init(sq, cursorMark);
        try {
            response = sc.query(sq);
            docs = response.getResults();
            hits = (int) docs.getNumFound();
        } catch (Exception e) {
            throw new RuntimeException("Initial Solr query failed", e);
        }
    }

    /**
     * Initialize the SolrQuery for a cursor fetch.
     * @param sq SolrQuery
     * @param cursorMark the cursor to resume from
     */
    public void init(SolrQuery sq, String cursorMark) {
        // Cursor-based paging requires a stable sort on the unique key.
        sq.setSort(solr_pk, SolrQuery.ORDER.asc);
        sq.set("cursorMark", cursorMark);
        sq.set("q", solr_query);
        sq.setRows(solrBatchSize);
    }

    /**
     * Copy a Solr document into the MapWritable handed to Hive.
     * @param doc the Solr document to convert
     * @param value the target MapWritable
     */
    public void collect(SolrDocument doc, MapWritable value) {
        for (String col : cols) {
            Object vObj = doc.get(col);
            Writable v = (vObj == null) ? NullWritable.get() : new Text(vObj.toString());
            value.put(new Text(col), v);
        }
    }

    @Override
    public boolean next(LongWritable key, MapWritable value) throws IOException {
        if (cursorMark != null) {
            // The constructor already issued the first query, so this branch is
            // entered first and drains the cached batch before fetching another.
            if (current < docs.size()) {
                key.set(pos++);
                SolrDocument doc = docs.get(current);
                // Convert the document into the output value.
                collect(doc, value);
                current++;
                return true;
            } else {
                // The current batch is exhausted: advance the cursor and fetch the next batch.
                try {
                    cursorMark = response.getNextCursorMark();
                    init(sq, cursorMark);
                    response = sc.query(sq);
                    docs = response.getResults();
                    hits = (int) docs.getNumFound();
                    // Cursor contract: when the returned mark equals the one that
                    // was sent and the batch is empty, there are no more records.
                    if (docs.isEmpty() || cursorMark.equals(response.getNextCursorMark())) {
                        return false;
                    }
                    /* Why must one document be emitted right here instead of just
                     * resetting current to 0 and looping again? Every time next()
                     * returns true, Hive deserializes the value and appends it as a
                     * row. If current were simply reset without calling collect(),
                     * the previous MapWritable contents would be appended again,
                     * duplicating the last row. Returning false is not an option
                     * either, because false ends the whole read. */
                    key.set(pos++);
                    SolrDocument doc = docs.get(0);
                    collect(doc, value);
                    current = 1;
                    return true;
                } catch (Exception e) {
                    log.error("Solr cursor fetch failed", e);
                    throw new IOException("Solr cursor fetch failed", e);
                }
            }
        }
        return false;
    }

    @Override
    public LongWritable createKey() {
        return new LongWritable();
    }

    @Override
    public MapWritable createValue() {
        return new MapWritable();
    }

    @Override
    public long getPos() throws IOException {
        // Report the number of records emitted so far.
        return pos;
    }

    @Override
    public void close() throws IOException {
        sc.close();
    }

    @Override
    public float getProgress() throws IOException {
        // Fraction of the total hits that has been emitted so far.
        return (hits == 0) ? 1.0f : Math.min(1.0f, pos / (float) hits);
    }
}
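
/*
 * A minimal usage sketch, kept as a comment so the file still compiles. It is
 * illustrative only: in the real flow the surrounding InputFormat constructs
 * the reader, and the Solr URL, column list, and config values below are
 * placeholder assumptions, not part of this project. HttpSolrClient comes
 * from org.apache.solr.client.solrj.impl; "*" is Solr's initial cursor mark.
 *
 *   JobConf job = new JobConf();
 *   job.set(Conf.COLUMNS, "id,name");              // Hive column list (example)
 *   job.set(Conf.SOLR_CURSOR_BATCH_SIZE, "500");   // rows per cursor fetch (example)
 *   job.set(Conf.SOLR_PK, "id");                   // Solr unique-key field (example)
 *   job.set(Conf.SOLR_QUERY, "*:*");               // query string (example)
 *
 *   SolrClient client = new HttpSolrClient("http://localhost:8983/solr/collection1");
 *   SolrReader reader = new SolrReader(client, new SolrQuery(), "*", job);
 *
 *   LongWritable key = reader.createKey();
 *   MapWritable value = reader.createValue();
 *   while (reader.next(key, value)) {
 *       // each iteration yields one Solr document as a MapWritable row
 *   }
 *   reader.close();
 */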