package org.elasticsearch.river.hbase;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.facet.FacetBuilders;
import org.elasticsearch.search.facet.statistical.StatisticalFacet;
import org.hbase.async.DeleteRequest;
import org.hbase.async.HBaseClient;
import org.hbase.async.KeyValue;
import org.hbase.async.Scanner;

/**
 * A separate Thread that does the actual fetching and storing of data from an HBase cluster.
 *
 * @author Ravi Gairola
 */
class HBaseParser implements Runnable {
	private static final String TIMESTMAP_STATS = "timestamp_stats";
	private final HBaseRiver river;
	private final ESLogger logger;
	private final HBaseCallbackLogger cbLogger;
	private int indexCounter;
	private HBaseClient client;
	private Scanner scanner;
	private boolean stopThread;

	HBaseParser(final HBaseRiver river) {
		this.river = river;
		this.logger = river.getLogger();
		this.cbLogger = new HBaseCallbackLogger(this.logger, "HBase Parser");
	}

	/**
	 * Timing mechanism of the thread that determines when a parse operation is supposed to run. Waits for the predefined
	 * interval until a new run is performed. The method checks every 1000ms if it should be parsing again. The first run is
	 * done immediately once the thread is started.
	 */
	@Override
	public void run() {
		this.logger.info("HBase Import Thread has started");
		long lastRun = 0;
		while (!this.stopThread) {
			if (lastRun + this.river.getInterval() < System.currentTimeMillis()) {
				lastRun = System.currentTimeMillis();
				try {
					this.indexCounter = 0;
					parse();
				} catch (Throwable t) {
					this.logger.error("An exception has been caught while parsing data from HBase", t);
				}
				if (!this.stopThread) {
					this.logger.info("HBase Import Thread is waiting for {} Seconds until the next run", this.river.getInterval() / 1000);
				}
			}
			try {
				Thread.sleep(1000);
			} catch (InterruptedException e) {
				this.logger.trace("HBase river parsing thread has been interrupted");
			}
		}
		this.logger.info("HBase Import Thread has finished");
	}
	/**
	 * The actual parse implementation that connects to the HBase cluster and fetches all rows since the last import. Fetched
	 * rows are added to an ElasticSearch bulk request whose size is controlled by batchSize (default is 100).
	 *
	 * @throws InterruptedException
	 * @throws Exception
	 */
	protected void parse() throws InterruptedException, Exception {
		this.logger.info("Parsing data from HBase");
		try {
			this.client = new HBaseClient(this.river.getHosts());
			this.logger.debug("Checking if table {} actually exists in HBase DB", this.river.getTable());
			this.client.ensureTableExists(this.river.getTable()).addErrback(this.cbLogger);
			this.logger.debug("Fetching HBase Scanner");
			this.scanner = this.client.newScanner(this.river.getTable());
			this.scanner.setServerBlockCache(false);
			if (this.river.getFamily() != null) {
				this.scanner.setFamily(this.river.getFamily());
			}
			if (this.river.getQualifiers() != null) {
				for (final String qualifier : this.river.getQualifiers().split(",")) {
					this.scanner.setQualifier(qualifier.trim().getBytes(this.river.getCharset()));
				}
			}
			setMinTimestamp(this.scanner);
			ArrayList<ArrayList<KeyValue>> rows;
			this.logger.debug("Starting to fetch rows");
			while ((rows = this.scanner.nextRows(this.river.getBatchSize()).addErrback(this.cbLogger).joinUninterruptibly()) != null) {
				if (this.stopThread) {
					this.logger.info("Stopping HBase import in the middle of it");
					break;
				}
				parseBulkOfRows(rows);
			}
		} finally {
			this.logger.debug("Closing HBase Scanner and Async Client");
			if (this.scanner != null) {
				try {
					this.scanner.close().addErrback(this.cbLogger);
				} catch (Exception e) {
					this.logger.error("An Exception has been caught while closing the HBase Scanner", e, (Object[]) null);
				}
			}
			if (this.client != null) {
				try {
					this.client.shutdown().addErrback(this.cbLogger);
				} catch (Exception e) {
					this.logger.error("An Exception has been caught while shutting down the HBase client", e, (Object[]) null);
				}
			}
		}
	}

	/**
	 * Run over a bulk of rows and process them.
	 *
	 * @param rows
	 */
	protected void parseBulkOfRows(final ArrayList<ArrayList<KeyValue>> rows) {
		this.logger.debug("Processing the next {} entries in HBase parsing process", rows.size());
		final BulkRequestBuilder bulkRequest = this.river.getEsClient().prepareBulk();
		final Map<String, byte[]> keyMapForDeletion = new HashMap<String, byte[]>();
		for (final ArrayList<KeyValue> row : rows) {
			if (this.stopThread) {
				this.logger.info("Stopping HBase import in the middle of it");
				break;
			}
			if (row.size() > 0) {
				final IndexRequestBuilder request = this.river.getEsClient().prepareIndex(this.river.getIndex(), this.river.getType());
				final byte[] key = row.get(0).key();
				final Map<String, Object> dataTree = readDataTree(row);
				request.setSource(dataTree);
				request.setTimestamp(String.valueOf(row.get(0).timestamp()));
				if (this.river.getIdField() == null) {
					final String keyString = new String(key, this.river.getCharset());
					request.setId(keyString);
					keyMapForDeletion.put(keyString, key);
				} else {
					final String keyString = findKeyInDataTree(dataTree, this.river.getIdField());
					keyMapForDeletion.put(keyString, key);
				}
				bulkRequest.add(request);
			}
		}
		final BulkResponse response = bulkRequest.execute().actionGet();
		this.indexCounter += response.items().length;
		this.logger.info("HBase river has indexed {} entries so far", this.indexCounter);
		final List<byte[]> failedKeys = new ArrayList<byte[]>();
		if (response.hasFailures()) {
			for (BulkItemResponse r : response.items()) {
				if (r.failed()) {
					failedKeys.add(keyMapForDeletion.remove(r.getId()));
				}
			}
			this.logger.error("Errors have occurred while trying to index new data from HBase");
			this.logger.debug("Failed keys are {}", failedKeys);
		}
		if (this.river.getDeleteOld()) {
			for (Entry<String, byte[]> keyEntry : keyMapForDeletion.entrySet()) {
				this.client.delete(new DeleteRequest(this.river.getTable().getBytes(), keyEntry.getValue())).addErrback(this.cbLogger);
			}
		}
	}
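	/**
	 * Resolves the configured id field inside a previously parsed data tree. The key path is split on the configured
	 * column separator and followed through the nested maps. As a hypothetical example, assuming the river is configured
	 * with the column separator "::", a keyPath of "meta::id" would look up the map stored under "meta" and return the
	 * String value stored under "id" in it; if any intermediate level is not a map, null is returned.
	 *
	 * @param dataTree
	 * @param keyPath
	 * @return the value found at the end of the key path, or null if the path cannot be resolved
	 */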
	@SuppressWarnings("unchecked")
	protected String findKeyInDataTree(final Map<String, Object> dataTree, final String keyPath) {
		if (!keyPath.contains(this.river.getColumnSeparator())) {
			return (String) dataTree.get(keyPath);
		}
		final String key = keyPath.substring(0, keyPath.indexOf(this.river.getColumnSeparator()));
		if (dataTree.get(key) instanceof Map) {
			final int subKeyIndex = keyPath.indexOf(this.river.getColumnSeparator()) + this.river.getColumnSeparator().length();
			return findKeyInDataTree((Map<String, Object>) dataTree.get(key), keyPath.substring(subKeyIndex));
		}
		return null;
	}

	/**
	 * Generate a tree structure that ElasticSearch can read and index from one of the rows that has been returned from
	 * HBase.
	 *
	 * @param row
	 * @return
	 */
	@SuppressWarnings("unchecked")
	protected Map<String, Object> readDataTree(final ArrayList<KeyValue> row) {
		final Map<String, Object> dataTree = new HashMap<String, Object>();
		for (final KeyValue column : row) {
			final String family = this.river.normalizeField(new String(column.family(), this.river.getCharset()));
			final String qualifier = new String(column.qualifier(), this.river.getCharset());
			final String value = new String(column.value(), this.river.getCharset());
			if (!dataTree.containsKey(family)) {
				dataTree.put(family, new HashMap<String, Object>());
			}
			readQualifierStructure((Map<String, Object>) dataTree.get(family), qualifier, value);
		}
		return dataTree;
	}

	/**
	 * Splits a column qualifier into sub-columns and stores the value at the corresponding level of the JSON tree.
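	 * As a hypothetical example, assuming a column separator of "::", a qualifier "address::street" with value "Main St"
	 * would be stored as the nested map {"address": {"street": "Main St"}} inside the parent map (field names are
	 * additionally passed through normalizeField()).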
	 *
	 * @param parent
	 * @param qualifier
	 * @param value
	 */
	@SuppressWarnings("unchecked")
	protected void readQualifierStructure(final Map<String, Object> parent, final String qualifier, final String value) {
		if (this.river.getColumnSeparator() != null && !this.river.getColumnSeparator().isEmpty()) {
			final int separatorPos = qualifier.indexOf(this.river.getColumnSeparator());
			if (separatorPos != -1) {
				final String parentQualifier = this.river.normalizeField(qualifier.substring(0, separatorPos));
				final String childQualifier = qualifier.substring(separatorPos + this.river.getColumnSeparator().length());
				if (!childQualifier.isEmpty()) {
					if (!(parent.get(parentQualifier) instanceof Map)) {
						parent.put(parentQualifier, new HashMap<String, Object>());
					}
					readQualifierStructure((Map<String, Object>) parent.get(parentQualifier), childQualifier, value);
					return;
				}
				parent.put(this.river.normalizeField(qualifier.replace(this.river.getColumnSeparator(), "")), value);
				return;
			}
		}
		parent.put(this.river.normalizeField(qualifier), value);
	}

	/**
	 * Signals the import thread to stop at the next opportunity. The open Scanner and Client are closed by the parse()
	 * method itself once the current run finishes.
	 */
	public synchronized void stopThread() {
		this.stopThread = true;
	}

	/**
	 * Sets the minimum timestamp on the HBase scanner, by looking into ElasticSearch for the last entry made.
	 *
	 * @param scanner
	 * @return the timestamp of the latest entry found in ElasticSearch, or 0 if no entry could be found
	 */
	protected long setMinTimestamp(final Scanner scanner) {
		this.logger.debug("Looking into ElasticSearch to determine timestamp of last import");
		final SearchResponse response = this.river.getEsClient()
			.prepareSearch(this.river.getIndex())
			.setTypes(this.river.getType())
			.setQuery(QueryBuilders.matchAllQuery())
			.addFacet(FacetBuilders.statisticalFacet(TIMESTMAP_STATS).field("_timestamp"))
			.execute()
			.actionGet();
		if (response.facets().facet(TIMESTMAP_STATS) != null) {
			this.logger.debug("Got statistical data from ElasticSearch about data timestamps");
			final StatisticalFacet facet = (StatisticalFacet) response.facets().facet(TIMESTMAP_STATS);
			final long timestamp = (long) Math.max(facet.getMax() + 1, 0);
			scanner.setMinTimestamp(timestamp);
			this.logger.debug("Found latest timestamp in ElasticSearch to be {}", timestamp);
			return timestamp;
		}
		this.logger.debug("No statistical data about data timestamps could be found -> probably no data there yet");
		scanner.setMinTimestamp(0);
		this.logger.debug("Found latest timestamp in ElasticSearch to be not present (-> 0)");
		return 0L;
	}
}