/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.phoenix.hbase.index; import static org.apache.phoenix.hbase.index.util.IndexManagementUtil.rethrowIndexingException; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CoprocessorEnvironment; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.client.Delete; import org.apache.hadoop.hbase.client.Durability; import org.apache.hadoop.hbase.client.Mutation; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; import org.apache.hadoop.hbase.coprocessor.ObserverContext; import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; import org.apache.hadoop.hbase.regionserver.InternalScanner; import org.apache.hadoop.hbase.regionserver.KeyValueScanner; import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress; import org.apache.hadoop.hbase.regionserver.ScanType; import org.apache.hadoop.hbase.regionserver.Store; import org.apache.hadoop.hbase.regionserver.wal.HLogKey; import org.apache.hadoop.hbase.regionserver.wal.WALEdit; import org.apache.hadoop.hbase.util.Pair; import org.apache.phoenix.hbase.index.builder.IndexBuildManager; import org.apache.phoenix.hbase.index.builder.IndexBuilder; import org.apache.phoenix.hbase.index.table.HTableInterfaceReference; import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr; import org.apache.phoenix.hbase.index.util.IndexManagementUtil; import org.apache.phoenix.hbase.index.util.VersionUtil; import org.apache.phoenix.hbase.index.wal.IndexedKeyValue; import org.apache.phoenix.hbase.index.write.IndexFailurePolicy; import org.apache.phoenix.hbase.index.write.IndexWriter; import org.apache.phoenix.hbase.index.write.recovery.PerRegionIndexWriteCache; import org.apache.phoenix.hbase.index.write.recovery.StoreFailuresInCachePolicy; import org.apache.phoenix.hbase.index.write.recovery.TrackingParallelWriterIndexCommitter; import org.apache.phoenix.trace.TracingUtils; import org.apache.phoenix.trace.util.NullSpan; import org.apache.htrace.Span; import org.apache.htrace.Trace; import org.apache.htrace.TraceScope; import com.google.common.collect.Multimap; /** * Do all the work of managing index updates from a single coprocessor. All Puts/Delets are passed * to an {@link IndexBuilder} to determine the actual updates to make. * <p> * If the WAL is enabled, these updates are then added to the WALEdit and attempted to be written to * the WAL after the WALEdit has been saved. If any of the index updates fail, this server is * immediately terminated and we rely on WAL replay to attempt the index updates again (see * {@link #preWALRestore(ObserverContext, HRegionInfo, HLogKey, WALEdit)}). * <p> * If the WAL is disabled, the updates are attempted immediately. No consistency guarantees are made * if the WAL is disabled - some or none of the index updates may be successful. All updates in a * single batch must have the same durability level - either everything gets written to the WAL or * nothing does. Currently, we do not support mixed-durability updates within a single batch. If you * want to have different durability levels, you only need to split the updates into two different * batches. */ public class Indexer extends BaseRegionObserver { private static final Log LOG = LogFactory.getLog(Indexer.class); protected IndexWriter writer; protected IndexBuildManager builder; /** Configuration key for the {@link IndexBuilder} to use */ public static final String INDEX_BUILDER_CONF_KEY = "index.builder"; /** * Configuration key for if the indexer should check the version of HBase is running. Generally, * you only want to ignore this for testing or for custom versions of HBase. */ public static final String CHECK_VERSION_CONF_KEY = "com.saleforce.hbase.index.checkversion"; private static final String INDEX_RECOVERY_FAILURE_POLICY_KEY = "org.apache.hadoop.hbase.index.recovery.failurepolicy"; /** * cache the failed updates to the various regions. Used for making the WAL recovery mechanisms * more robust in the face of recoverying index regions that were on the same server as the * primary table region */ private PerRegionIndexWriteCache failedIndexEdits = new PerRegionIndexWriteCache(); /** * IndexWriter for writing the recovered index edits. Separate from the main indexer since we need * different write/failure policies */ private IndexWriter recoveryWriter; private boolean stopped; private boolean disabled; public static final String RecoveryFailurePolicyKeyForTesting = INDEX_RECOVERY_FAILURE_POLICY_KEY; public static final int INDEXING_SUPPORTED_MAJOR_VERSION = VersionUtil .encodeMaxPatchVersion(0, 94); public static final int INDEXING_SUPPORTED__MIN_MAJOR_VERSION = VersionUtil .encodeVersion("0.94.0"); private static final int INDEX_WAL_COMPRESSION_MINIMUM_SUPPORTED_VERSION = VersionUtil .encodeVersion("0.94.9"); @Override public void start(CoprocessorEnvironment e) throws IOException { try { final RegionCoprocessorEnvironment env = (RegionCoprocessorEnvironment) e; String serverName = env.getRegionServerServices().getServerName().getServerName(); if (env.getConfiguration().getBoolean(CHECK_VERSION_CONF_KEY, true)) { // make sure the right version <-> combinations are allowed. String errormsg = Indexer.validateVersion(env.getHBaseVersion(), env.getConfiguration()); if (errormsg != null) { IOException ioe = new IOException(errormsg); env.getRegionServerServices().abort(errormsg, ioe); throw ioe; } } this.builder = new IndexBuildManager(env); // setup the actual index writer this.writer = new IndexWriter(env, serverName + "-index-writer"); // setup the recovery writer that does retries on the failed edits TrackingParallelWriterIndexCommitter recoveryCommmiter = new TrackingParallelWriterIndexCommitter(); try { // get the specified failure policy. We only ever override it in tests, but we need to do it // here Class<? extends IndexFailurePolicy> policyClass = env.getConfiguration().getClass(INDEX_RECOVERY_FAILURE_POLICY_KEY, StoreFailuresInCachePolicy.class, IndexFailurePolicy.class); IndexFailurePolicy policy = policyClass.getConstructor(PerRegionIndexWriteCache.class).newInstance(failedIndexEdits); LOG.debug("Setting up recovery writter with committer: " + recoveryCommmiter.getClass() + " and failure policy: " + policy.getClass()); recoveryWriter = new IndexWriter(recoveryCommmiter, policy, env, serverName + "-recovery-writer"); } catch (Exception ex) { throw new IOException("Could not instantiate recovery failure policy!", ex); } } catch (NoSuchMethodError ex) { disabled = true; super.start(e); LOG.error("Must be too early a version of HBase. Disabled coprocessor ", ex); } } @Override public void stop(CoprocessorEnvironment e) throws IOException { if (this.stopped) { return; } if (this.disabled) { super.stop(e); return; } this.stopped = true; String msg = "Indexer is being stopped"; this.builder.stop(msg); this.writer.stop(msg); this.recoveryWriter.stop(msg); } @Override public void preBatchMutate(ObserverContext<RegionCoprocessorEnvironment> c, MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { if (this.disabled) { super.preBatchMutate(c, miniBatchOp); return; } try { preBatchMutateWithExceptions(c, miniBatchOp); return; } catch (Throwable t) { rethrowIndexingException(t); } throw new RuntimeException( "Somehow didn't return an index update but also didn't propagate the failure to the client!"); } public void preBatchMutateWithExceptions(ObserverContext<RegionCoprocessorEnvironment> c, MiniBatchOperationInProgress<Mutation> miniBatchOp) throws Throwable { // first group all the updates for a single row into a single update to be processed Map<ImmutableBytesPtr, MultiMutation> mutations = new HashMap<ImmutableBytesPtr, MultiMutation>(); Durability defaultDurability = Durability.SYNC_WAL; if(c.getEnvironment().getRegion() != null) { defaultDurability = c.getEnvironment().getRegion().getTableDesc().getDurability(); defaultDurability = (defaultDurability == Durability.USE_DEFAULT) ? Durability.SYNC_WAL : defaultDurability; } Durability durability = Durability.SKIP_WAL; for (int i = 0; i < miniBatchOp.size(); i++) { Mutation m = miniBatchOp.getOperation(i); // skip this mutation if we aren't enabling indexing // unfortunately, we really should ask if the raw mutation (rather than the combined mutation) // should be indexed, which means we need to expose another method on the builder. Such is the // way optimization go though. if (!this.builder.isEnabled(m)) { continue; } Durability effectiveDurablity = (m.getDurability() == Durability.USE_DEFAULT) ? defaultDurability : m.getDurability(); if (effectiveDurablity.ordinal() > durability.ordinal()) { durability = effectiveDurablity; } // add the mutation to the batch set ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow()); MultiMutation stored = mutations.get(row); // we haven't seen this row before, so add it if (stored == null) { stored = new MultiMutation(row); mutations.put(row, stored); } stored.addAll(m); } // early exit if it turns out we don't have any edits if (mutations.entrySet().size() == 0) { return; } // dump all the index updates into a single WAL. They will get combined in the end anyways, so // don't worry which one we get WALEdit edit = miniBatchOp.getWalEdit(0); if (edit == null) { edit = new WALEdit(); miniBatchOp.setWalEdit(0, edit); } // get the current span, or just use a null-span to avoid a bunch of if statements try (TraceScope scope = Trace.startSpan("Starting to build index updates")) { Span current = scope.getSpan(); if (current == null) { current = NullSpan.INSTANCE; } // get the index updates for all elements in this batch Collection<Pair<Mutation, byte[]>> indexUpdates = this.builder.getIndexUpdate(miniBatchOp, mutations.values()); current.addTimelineAnnotation("Built index updates, doing preStep"); TracingUtils.addAnnotation(current, "index update count", indexUpdates.size()); // write them, either to WAL or the index tables doPre(indexUpdates, edit, durability); } } private class MultiMutation extends Mutation { private ImmutableBytesPtr rowKey; public MultiMutation(ImmutableBytesPtr rowkey) { this.rowKey = rowkey; } /** * @param stored */ public void addAll(Mutation stored) { // add all the kvs for (Entry<byte[], List<Cell>> kvs : stored.getFamilyCellMap().entrySet()) { byte[] family = kvs.getKey(); List<Cell> list = getKeyValueList(family, kvs.getValue().size()); list.addAll(kvs.getValue()); familyMap.put(family, list); } // add all the attributes, not overriding already stored ones for (Entry<String, byte[]> attrib : stored.getAttributesMap().entrySet()) { if (this.getAttribute(attrib.getKey()) == null) { this.setAttribute(attrib.getKey(), attrib.getValue()); } } } private List<Cell> getKeyValueList(byte[] family, int hint) { List<Cell> list = familyMap.get(family); if (list == null) { list = new ArrayList<Cell>(hint); } return list; } @Override public byte[] getRow(){ return this.rowKey.copyBytesIfNecessary(); } @Override public int hashCode() { return this.rowKey.hashCode(); } @Override public boolean equals(Object o) { return o == null ? false : o.hashCode() == this.hashCode(); } } /** * Add the index updates to the WAL, or write to the index table, if the WAL has been disabled * @return <tt>true</tt> if the WAL has been updated. * @throws IOException */ private boolean doPre(Collection<Pair<Mutation, byte[]>> indexUpdates, final WALEdit edit, final Durability durability) throws IOException { // no index updates, so we are done if (indexUpdates == null || indexUpdates.size() == 0) { return false; } // if writing to wal is disabled, we never see the WALEdit updates down the way, so do the index // update right away if (durability == Durability.SKIP_WAL) { try { this.writer.write(indexUpdates); return false; } catch (Throwable e) { LOG.error("Failed to update index with entries:" + indexUpdates, e); IndexManagementUtil.rethrowIndexingException(e); } } // we have all the WAL durability, so we just update the WAL entry and move on for (Pair<Mutation, byte[]> entry : indexUpdates) { edit.add(new IndexedKeyValue(entry.getSecond(), entry.getFirst())); } return true; } @Override public void postPut(ObserverContext<RegionCoprocessorEnvironment> e, Put put, WALEdit edit, final Durability durability) throws IOException { if (this.disabled) { super.postPut(e, put, edit, durability); return; } doPost(edit, put, durability); } @Override public void postDelete(ObserverContext<RegionCoprocessorEnvironment> e, Delete delete, WALEdit edit, final Durability durability) throws IOException { if (this.disabled) { super.postDelete(e, delete, edit, durability); return; } doPost(edit, delete, durability); } @Override public void postBatchMutate(ObserverContext<RegionCoprocessorEnvironment> c, MiniBatchOperationInProgress<Mutation> miniBatchOp) throws IOException { if (this.disabled) { super.postBatchMutate(c, miniBatchOp); return; } this.builder.batchCompleted(miniBatchOp); //each batch operation, only the first one will have anything useful, so we can just grab that Mutation mutation = miniBatchOp.getOperation(0); WALEdit edit = miniBatchOp.getWalEdit(0); doPost(edit, mutation, mutation.getDurability()); } private void doPost(WALEdit edit, Mutation m, final Durability durability) throws IOException { try { doPostWithExceptions(edit, m, durability); return; } catch (Throwable e) { rethrowIndexingException(e); } throw new RuntimeException( "Somehow didn't complete the index update, but didn't return succesfully either!"); } private void doPostWithExceptions(WALEdit edit, Mutation m, final Durability durability) throws Exception { //short circuit, if we don't need to do any work if (durability == Durability.SKIP_WAL || !this.builder.isEnabled(m)) { // already did the index update in prePut, so we are done return; } // get the current span, or just use a null-span to avoid a bunch of if statements try (TraceScope scope = Trace.startSpan("Completing index writes")) { Span current = scope.getSpan(); if (current == null) { current = NullSpan.INSTANCE; } // there is a little bit of excess here- we iterate all the non-indexed kvs for this check first // and then do it again later when getting out the index updates. This should be pretty minor // though, compared to the rest of the runtime IndexedKeyValue ikv = getFirstIndexedKeyValue(edit); /* * early exit - we have nothing to write, so we don't need to do anything else. NOTE: we don't * release the WAL Rolling lock (INDEX_UPDATE_LOCK) since we never take it in doPre if there are * no index updates. */ if (ikv == null) { return; } /* * only write the update if we haven't already seen this batch. We only want to write the batch * once (this hook gets called with the same WALEdit for each Put/Delete in a batch, which can * lead to writing all the index updates for each Put/Delete). */ if (!ikv.getBatchFinished()) { Collection<Pair<Mutation, byte[]>> indexUpdates = extractIndexUpdate(edit); // the WAL edit is kept in memory and we already specified the factory when we created the // references originally - therefore, we just pass in a null factory here and use the ones // already specified on each reference try { current.addTimelineAnnotation("Actually doing index update for first time"); writer.writeAndKillYourselfOnFailure(indexUpdates); } finally { // With a custom kill policy, we may throw instead of kill the server. // Without doing this in a finally block (at least with the mini cluster), // the region server never goes down. // mark the batch as having been written. In the single-update case, this never gets check // again, but in the batch case, we will check it again (see above). ikv.markBatchFinished(); } } } } /** * Search the {@link WALEdit} for the first {@link IndexedKeyValue} present * @param edit {@link WALEdit} * @return the first {@link IndexedKeyValue} in the {@link WALEdit} or <tt>null</tt> if not * present */ private IndexedKeyValue getFirstIndexedKeyValue(WALEdit edit) { for (Cell kv : edit.getCells()) { if (kv instanceof IndexedKeyValue) { return (IndexedKeyValue) kv; } } return null; } /** * Extract the index updates from the WAL Edit * @param edit to search for index updates * @return the mutations to apply to the index tables */ private Collection<Pair<Mutation, byte[]>> extractIndexUpdate(WALEdit edit) { Collection<Pair<Mutation, byte[]>> indexUpdates = new ArrayList<Pair<Mutation, byte[]>>(); for (Cell kv : edit.getCells()) { if (kv instanceof IndexedKeyValue) { IndexedKeyValue ikv = (IndexedKeyValue) kv; indexUpdates.add(new Pair<Mutation, byte[]>(ikv.getMutation(), ikv.getIndexTable())); } } return indexUpdates; } @Override public void postOpen(final ObserverContext<RegionCoprocessorEnvironment> c) { Multimap<HTableInterfaceReference, Mutation> updates = failedIndexEdits.getEdits(c.getEnvironment().getRegion()); if (this.disabled) { super.postOpen(c); return; } LOG.info("Found some outstanding index updates that didn't succeed during" + " WAL replay - attempting to replay now."); //if we have no pending edits to complete, then we are done if (updates == null || updates.size() == 0) { return; } // do the usual writer stuff, killing the server again, if we can't manage to make the index // writes succeed again try { writer.writeAndKillYourselfOnFailure(updates); } catch (IOException e) { LOG.error("Exception thrown instead of killing server during index writing", e); } } @Override public void preWALRestore(ObserverContext<RegionCoprocessorEnvironment> env, HRegionInfo info, HLogKey logKey, WALEdit logEdit) throws IOException { if (this.disabled) { super.preWALRestore(env, info, logKey, logEdit); return; } // TODO check the regions in transition. If the server on which the region lives is this one, // then we should rety that write later in postOpen. // we might be able to get even smarter here and pre-split the edits that are server-local // into their own recovered.edits file. This then lets us do a straightforward recovery of each // region (and more efficiently as we aren't writing quite as hectically from this one place). /* * Basically, we let the index regions recover for a little while long before retrying in the * hopes they come up before the primary table finishes. */ Collection<Pair<Mutation, byte[]>> indexUpdates = extractIndexUpdate(logEdit); recoveryWriter.write(indexUpdates); } /** * Create a custom {@link InternalScanner} for a compaction that tracks the versions of rows that * are removed so we can clean then up from the the index table(s). * <p> * This is not yet implemented - its not clear if we should even mess around with the Index table * for these rows as those points still existed. TODO: v2 of indexing */ @Override public InternalScanner preCompactScannerOpen(ObserverContext<RegionCoprocessorEnvironment> c, Store store, List<? extends KeyValueScanner> scanners, ScanType scanType, long earliestPutTs, InternalScanner s) throws IOException { return super.preCompactScannerOpen(c, store, scanners, scanType, earliestPutTs, s); } /** * Exposed for testing! * @return the currently instantiated index builder */ public IndexBuilder getBuilderForTesting() { return this.builder.getBuilderForTesting(); } /** * Validate that the version and configuration parameters are supported * @param hbaseVersion current version of HBase on which <tt>this</tt> coprocessor is installed * @param conf configuration to check for allowed parameters (e.g. WAL Compression only if >= * 0.94.9) * @return <tt>null</tt> if the version is supported, the error message to display otherwise */ public static String validateVersion(String hbaseVersion, Configuration conf) { int encodedVersion = VersionUtil.encodeVersion(hbaseVersion); // above 0.94 everything should be supported if (encodedVersion > INDEXING_SUPPORTED_MAJOR_VERSION) { return null; } // check to see if its at least 0.94 if (encodedVersion < INDEXING_SUPPORTED__MIN_MAJOR_VERSION) { return "Indexing not supported for versions older than 0.94.X"; } // if less than 0.94.9, we need to check if WAL Compression is enabled if (encodedVersion < INDEX_WAL_COMPRESSION_MINIMUM_SUPPORTED_VERSION) { if (conf.getBoolean(HConstants.ENABLE_WAL_COMPRESSION, false)) { return "Indexing not supported with WAL Compression for versions of HBase older than 0.94.9 - found version:" + hbaseVersion; } } return null; } /** * Enable indexing on the given table * @param desc {@link HTableDescriptor} for the table on which indexing should be enabled * @param builder class to use when building the index for this table * @param properties map of custom configuration options to make available to your * {@link IndexBuilder} on the server-side * @param priority TODO * @throws IOException the Indexer coprocessor cannot be added */ public static void enableIndexing(HTableDescriptor desc, Class<? extends IndexBuilder> builder, Map<String, String> properties, int priority) throws IOException { if (properties == null) { properties = new HashMap<String, String>(); } properties.put(Indexer.INDEX_BUILDER_CONF_KEY, builder.getName()); desc.addCoprocessor(Indexer.class.getName(), null, priority, properties); } }