package org.elasticsearch.river.hbase;

import java.lang.Thread.UncaughtExceptionHandler;
import java.nio.charset.Charset;
import java.security.InvalidParameterException;
import java.util.Map;
import java.util.regex.Pattern;

import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.admin.indices.status.ShardStatus;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.index.shard.IndexShardState;
import org.elasticsearch.indices.IndexAlreadyExistsException;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;

/**
 * An HBase import river built similar to the MySQL river, that was modeled after the Solr SQL import functionality.
 *
 * @author Ravi Gairola
 */
public class HBaseRiver extends AbstractRiverComponent implements River, UncaughtExceptionHandler {
	private static final String	CONFIG_SPACE	= "hbase";
	private final Client		esClient;
	private volatile Runnable	parser;

	/**
	 * Comma separated list of Zookeeper hosts to which the HBase client can connect to find the cluster.
	 */
	private final String		hosts;

	/**
	 * The HBase table name to be imported from.
	 */
	private final String		table;

	/**
	 * The ElasticSearch index name to be imported to. (default is the river name)
	 */
	private final String		index;

	/**
	 * The ElasticSearch type name to be imported to. (Default is the source table name)
	 */
	private final String		type;

	/**
	 * The interval in ms with which the river is supposed to run (60000 = every minute). (Default is every 10 minutes)
	 */
	private final long			interval;

	/**
	 * How big are the ElasticSearch bulk indexing sizes supposed to be. Tweaking this might improve performance. (Default is
	 * 100 operations)
	 */
	private final int			batchSize;

	/**
	 * Name of the field from HBase to be used as an idField in ElasticSearch. The mapping will set up accordingly, so that
	 * the _id field is routed to this field name (you can access it then under both the field name and "_id"). If no id
	 * field is given, then ElasticSearch will automatically generate an id.
	 */
	private final String		idField;

	/**
	 * The char set which is used to parse data from HBase. (Default is UTF-8)
	 */
	private final Charset		charset;

	/**
	 * Limit the scanning of the HBase table to a certain family.
	 */
	private final byte[]		family;

	/**
	 * Limit the scanning of the HBase table to a number of qualifiers. A family must be set for this to take effect.
	 * Multiple qualifiers can be set via comma separated list.
	 */
	private final String		qualifiers;

	/**
	 * Some names must be given in a lower case format (the index name for example), others are more flexible. This flag will
	 * normalize all fields to lower case and remove special characters that ElasticSearch can't handle. (The filter is
	 * probably stricter than needed in most cases)
	 */
	private final boolean		normalizeFields;

	/**
	 * Splits up the column into further sub columns if a separator is defined. For example:
	 *
	 * <pre>
	 * Separator: "-"
	 * Columns name: "this-is-my-column"
	 * Result:
	 * {
	 *   this: {
	 *     is: {
	 *       my: {
	 *         column: -value-
	 *       }
	 *     }
	 *   }
	 * }
	 * </pre>
	 *
	 * If no separator is defined, or the separator is empty, no operation is performed. Try to use single character
	 * separators, as multi character separators will allow partial hits of a separator to be part of the data. (e.g. A
	 * separator defined as "()" will leave all "(" and ")" in the parsed data.
	 */
	public final String			columnSeparator;

	/**
	 * Define a custom mapping that will be used instead of an automatically generated one. Make sure to enable time stamps
	 * and if you want an id-field to be recognized set the proper alias.
	 */
	public final String			customMapping;

	/**
	 * Setting if old entries that have just been read from HBase should be deleted after they've been read.
	 */
	private final boolean		deleteOld;

	/**
	 * Loads and verifies all the configuration needed to run this river.
	 *
	 * @param riverName name of this river instance (also the default index name)
	 * @param settings river settings read from the "hbase" config space
	 * @param esClient client used to talk to the ElasticSearch cluster
	 * @throws InvalidParameterException if a mandatory configuration (hosts, table) is missing
	 * @throws IllegalArgumentException if interval or batchSize are not positive
	 */
	@Inject
	public HBaseRiver(final RiverName riverName, final RiverSettings settings, final Client esClient) {
		super(riverName, settings);
		this.esClient = esClient;
		this.logger.info("Creating HBase Stream River");

		// normalizeFields must be read first because normalizeField() depends on it.
		this.normalizeFields = Boolean.parseBoolean(readConfig("normalizeFields", "true"));
		this.hosts = readConfig("hosts");
		this.table = readConfig("table");
		this.columnSeparator = readConfig("columnSeparator", null);
		this.idField = normalizeField(readConfig("idField", null));
		this.index = normalizeField(readConfig("index", riverName.name()));
		this.type = normalizeField(readConfig("type", this.table));
		this.interval = Long.parseLong(readConfig("interval", "600000"));
		this.batchSize = Integer.parseInt(readConfig("batchSize", "100"));
		this.charset = Charset.forName(readConfig("charset", "UTF-8"));
		this.deleteOld = Boolean.parseBoolean(readConfig("deleteOld", "false"));

		final String family = readConfig("family", null);
		this.family = family != null ? family.getBytes(this.charset) : null;
		this.qualifiers = readConfig("qualifiers", null);
		this.customMapping = readConfig("customMapping", null);

		if (this.interval <= 0) {
			throw new IllegalArgumentException("The interval between runs must be at least 1 ms. The current config is set to "
					+ this.interval);
		}
		if (this.batchSize <= 0) {
			throw new IllegalArgumentException("The batch size must be set to at least 1. The current config is set to "
					+ this.batchSize);
		}
	}

	/**
	 * Fetch the value of a configuration that has no default value and is therefore mandatory. Empty (trimmed) strings are
	 * as invalid as no value at all (null).
	 *
	 * @param config Key of the configuration to fetch
	 * @throws InvalidParameterException if a configuration is missing (null or empty)
	 * @return the configured value, never null or blank
	 */
	private String readConfig(final String config) {
		final String result = readConfig(config, null);
		if (result == null || result.trim().isEmpty()) {
			this.logger.error("Unable to read required config {}. Aborting!", config);
			throw new InvalidParameterException("Unable to read required config " + config);
		}
		return result;
	}

	/**
	 * Fetch the value of a configuration that has a default value and is therefore optional.
	 *
	 * @param config Key of the configuration to fetch
	 * @param defaultValue The value to set if no value could be found
	 * @return the configured value, or {@code defaultValue} if the key is absent
	 */
	@SuppressWarnings({ "unchecked" })
	private String readConfig(final String config, final String defaultValue) {
		if (this.settings.settings().containsKey(CONFIG_SPACE)) {
			// Renamed from "mysqlSettings" (copy-paste leftover from the MySQL river).
			final Map<String, Object> hbaseSettings = (Map<String, Object>) this.settings.settings().get(CONFIG_SPACE);
			return XContentMapValues.nodeStringValue(hbaseSettings.get(config), defaultValue);
		}
		return defaultValue;
	}

	/**
	 * This method is launched by ElasticSearch and starts the HBase River. The method will try to create a mapping with time
	 * stamps enabled. If a mapping already exists the user should make sure, that time stamps are enabled for this type.
	 */
	@Override
	public synchronized void start() {
		if (this.parser != null) {
			this.logger.warn("Trying to start HBase stream although it is already running");
			return;
		}
		this.parser = new HBaseParser(this);

		this.logger.info("Waiting for Index to be ready for interaction");
		waitForESReady();

		this.logger.info("Starting HBase Stream");
		final String mapping;
		if (this.customMapping != null && !this.customMapping.trim().isEmpty()) {
			mapping = this.customMapping;
		}
		else if (this.idField == null) {
			// No id field configured: let ElasticSearch generate ids automatically.
			// BUGFIX: this case must short-circuit the _id-path branches below - the
			// original code fell through and dereferenced the null idField (NPE when a
			// columnSeparator was set, or a bogus "_id":{"path":"null"} mapping otherwise).
			mapping = "{\"" + this.type + "\":{\"_timestamp\":{\"enabled\":true}}}";
		}
		else if (this.columnSeparator != null) {
			mapping = "{\"" + this.type + "\":{\"_timestamp\":{\"enabled\":true},\"_id\":{\"path\":\""
					+ this.idField.replace(this.columnSeparator, ".") + "\"}}}";
		}
		else {
			mapping = "{\"" + this.type + "\":{\"_timestamp\":{\"enabled\":true},\"_id\":{\"path\":\"" + this.idField + "\"}}}";
		}

		try {
			this.esClient.admin().indices().prepareCreate(this.index).addMapping(this.type, mapping).execute().actionGet();
			this.logger.info("Created Index {} with _timestamp mapping for {}", this.index, this.type);
		} catch (Exception e) {
			if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) {
				this.logger.debug("Not creating Index {} as it already exists", this.index);
			}
			else if (ExceptionsHelper.unwrapCause(e) instanceof ElasticSearchException) {
				this.logger.debug("Mapping {}.{} already exists and will not be created", this.index, this.type);
			}
			else {
				this.logger.warn("failed to create index [{}], disabling river...", e, this.index);
				return;
			}
		}

		// The index may have pre-existed without the mapping; try to put it separately.
		try {
			this.esClient.admin()
				.indices()
				.preparePutMapping(this.index)
				.setType(this.type)
				.setSource(mapping)
				.setIgnoreConflicts(true)
				.execute()
				.actionGet();
		} catch (ElasticSearchException e) {
			this.logger.debug("Mapping already exists for index {} and type {}", this.index, this.type);
		}

		final Thread t = EsExecutors.daemonThreadFactory(this.settings.globalSettings(), "hbase_slurper").newThread(this.parser);
		t.setUncaughtExceptionHandler(this);
		t.start();
	}

	/**
	 * Blocks until either the target index does not exist yet (nothing to wait for) or all of its shards have reached the
	 * STARTED state, polling once per second.
	 */
	private void waitForESReady() {
		if (!this.esClient.admin().indices().prepareExists(this.index).execute().actionGet().exists()) {
			return;
		}
		for (final ShardStatus status : this.esClient.admin().indices().prepareStatus(this.index).execute().actionGet().getShards()) {
			if (status.getState() != IndexShardState.STARTED) {
				try {
					Thread.sleep(1000);
				} catch (InterruptedException e) {
					// Restore the interrupt flag so callers can observe the interruption.
					Thread.currentThread().interrupt();
					this.logger.trace("HBase thread has been interrupted while waiting for the database to be reachable");
				}
				this.logger.trace("Waiting...");
				waitForESReady();
				break;
			}
		}
	}

	/**
	 * This method is called by ElasticSearch when shutting down the river. The method will stop the thread and close all
	 * connections to HBase.
	 */
	@Override
	public synchronized void close() {
		this.logger.info("Closing HBase river");
		if (this.parser instanceof HBaseParser) {
			((HBaseParser) this.parser).stopThread();
		}
		this.parser = null;
	}

	/**
	 * Some of the asynchronous methods of the HBase client will throw Exceptions that are not caught anywhere else.
	 */
	@Override
	public void uncaughtException(final Thread arg0, final Throwable arg1) {
		this.logger.error("An Exception has been thrown in HBase Import Thread", arg1, (Object[]) null);
	}

	/**
	 * If the normalizeField flag is set, this method will return a lower case representation of the field, as well as
	 * stripping away all special characters except "-" and "_" (and the configured column separator, if any).
	 *
	 * @param fieldName the raw field name (may be null)
	 * @return the normalized field name, or the input unchanged if normalization is disabled or the input is null
	 */
	public String normalizeField(final String fieldName) {
		if (!isNormalizeFields() || fieldName == null) {
			return fieldName;
		}
		String keep = "a-z0-9\\-_";
		if (getColumnSeparator() != null) {
			// BUGFIX: Pattern.quote (\Q...\E, valid inside a character class) safely
			// preserves the separator characters. The previous per-character "\\" prefix
			// could accidentally form class escapes such as \s or \W and strip/keep the
			// wrong characters.
			keep += Pattern.quote(getColumnSeparator());
		}
		return fieldName.toLowerCase().replaceAll("[^" + keep + "]", "");
	}

	public boolean isNormalizeFields() {
		return this.normalizeFields;
	}

	public long getInterval() {
		return this.interval;
	}

	public String getTable() {
		return this.table;
	}

	public String getHosts() {
		return this.hosts;
	}

	public byte[] getFamily() {
		return this.family;
	}

	public String getQualifiers() {
		return this.qualifiers;
	}

	public Charset getCharset() {
		return this.charset;
	}

	public int getBatchSize() {
		return this.batchSize;
	}

	public Client getEsClient() {
		return this.esClient;
	}

	public String getIndex() {
		return this.index;
	}

	public String getType() {
		return this.type;
	}

	public String getIdField() {
		return this.idField;
	}

	public String getColumnSeparator() {
		return this.columnSeparator;
	}

	public ESLogger getLogger() {
		return this.logger;
	}

	public boolean getDeleteOld() {
		return this.deleteOld;
	}
}