/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.alipay.sofa.jraft.rhea.storage;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Function;

import org.apache.commons.io.FileUtils;
import org.rocksdb.BackupEngine;
import org.rocksdb.BackupInfo;
import org.rocksdb.BackupableDBOptions;
import org.rocksdb.BlockBasedTableConfig;
import org.rocksdb.Checkpoint;
import org.rocksdb.ColumnFamilyDescriptor;
import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.ColumnFamilyOptions;
import org.rocksdb.DBOptions;
import org.rocksdb.Env;
import org.rocksdb.EnvOptions;
import org.rocksdb.IngestExternalFileOptions;
import org.rocksdb.Options;
import org.rocksdb.ReadOptions;
import org.rocksdb.RestoreOptions;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.rocksdb.Snapshot;
import org.rocksdb.SstFileWriter;
import org.rocksdb.Statistics;
import org.rocksdb.StatisticsCollectorCallback;
import org.rocksdb.StatsCollectorInput;
import org.rocksdb.StringAppendOperator;
import org.rocksdb.WriteBatch;
import org.rocksdb.WriteOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.alipay.sofa.jraft.rhea.errors.StorageException;
import com.alipay.sofa.jraft.rhea.metadata.Region;
import com.alipay.sofa.jraft.rhea.options.RocksDBOptions;
import com.alipay.sofa.jraft.rhea.rocks.support.RocksStatisticsCollector;
import com.alipay.sofa.jraft.rhea.serialization.Serializer;
import com.alipay.sofa.jraft.rhea.serialization.Serializers;
import com.alipay.sofa.jraft.rhea.util.ByteArray;
import com.alipay.sofa.jraft.rhea.util.Lists;
import com.alipay.sofa.jraft.rhea.util.Maps;
import com.alipay.sofa.jraft.rhea.util.Partitions;
import com.alipay.sofa.jraft.rhea.util.StackTraceUtil;
import com.alipay.sofa.jraft.rhea.util.concurrent.DistributedLock;
import com.alipay.sofa.jraft.util.Bits;
import com.alipay.sofa.jraft.util.BytesUtil;
import com.alipay.sofa.jraft.util.DebugStatistics;
import com.alipay.sofa.jraft.util.Describer;
import com.alipay.sofa.jraft.util.Requires;
import com.alipay.sofa.jraft.util.StorageOptionsFactory;
import com.alipay.sofa.jraft.util.SystemPropertyUtil;
import com.alipay.sofa.jraft.util.concurrent.AdjustableSemaphore;
import com.codahale.metrics.Timer;

/**
 * Local KV store based on RocksDB
 *
 * @author dennis
 * @author jiachun.fjc
 */
public class RocksRawKVStore extends BatchRawKVStore<RocksDBOptions> implements Describer {

    private static final Logger                LOG                  = LoggerFactory.getLogger(RocksRawKVStore.class);

    static {
        RocksDB.loadLibrary();
    }

    // The maximum number of keys in one batch write
    public static final int                    MAX_BATCH_WRITE_SIZE = SystemPropertyUtil.getInt(
                                                                        "rhea.rocksdb.user.max_batch_write_size", 128);

    private final AdjustableSemaphore          shutdownLock         = new AdjustableSemaphore();
    private final ReadWriteLock                readWriteLock        = new ReentrantReadWriteLock();

    private final AtomicLong                   databaseVersion      = new AtomicLong(0);
    private final Serializer                   serializer           = Serializers.getDefault();

    private final List<ColumnFamilyOptions>    cfOptionsList        = Lists.newArrayList();
    private final List<ColumnFamilyDescriptor> cfDescriptors        = Lists.newArrayList();

    private ColumnFamilyHandle                 defaultHandle;
    private ColumnFamilyHandle                 sequenceHandle;
    private ColumnFamilyHandle                 lockingHandle;
    private ColumnFamilyHandle                 fencingHandle;

    private RocksDB                            db;

    private RocksDBOptions                     opts;
    private DBOptions                          options;
    private WriteOptions                       writeOptions;
    private DebugStatistics                    statistics;
    private RocksStatisticsCollector           statisticsCollector;

    @Override
    public boolean init(final RocksDBOptions opts) {
        final Lock writeLock = this.readWriteLock.writeLock();
        writeLock.lock();
        try {
            if (this.db != null) {
                LOG.info("[RocksRawKVStore] already started.");
                return true;
            }
            this.opts = opts;
            this.options = createDBOptions();
            if (opts.isOpenStatisticsCollector()) {
                this.statistics = new DebugStatistics();
                this.options.setStatistics(this.statistics);
                final long intervalSeconds = opts.getStatisticsCallbackIntervalSeconds();
                if (intervalSeconds > 0) {
                    this.statisticsCollector = new RocksStatisticsCollector(TimeUnit.SECONDS.toMillis(intervalSeconds));
                    this.statisticsCollector.start();
                }
            }
            final ColumnFamilyOptions cfOptions = createColumnFamilyOptions();
            this.cfOptionsList.add(cfOptions);
            // default column family
            this.cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOptions));
            // sequence column family
            this.cfDescriptors.add(new ColumnFamilyDescriptor(BytesUtil.writeUtf8("RHEA_SEQUENCE"), cfOptions));
            // locking column family
            this.cfDescriptors.add(new ColumnFamilyDescriptor(BytesUtil.writeUtf8("RHEA_LOCKING"), cfOptions));
            // fencing column family
            this.cfDescriptors.add(new ColumnFamilyDescriptor(BytesUtil.writeUtf8("RHEA_FENCING"), cfOptions));
            this.writeOptions = new WriteOptions();
            this.writeOptions.setSync(opts.isSync());
            // If `sync` is true, `disableWAL` must be set to false.
            this.writeOptions.setDisableWAL(!opts.isSync() && opts.isDisableWAL());
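            // (RocksDB rejects a write that requests sync while the WAL is disabled,
            // hence the `!opts.isSync()` guard above.)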
            // Delete any existing data; relying on raft's snapshot and log replay
            // to restore the data is the correct behavior.
            destroyRocksDB(opts);
            openRocksDB(opts);
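            // One permit means the store is open for business; shutdown() later drops
            // the permits to zero so that long-running tasks (e.g. doCreateSstFiles)
            // can detect the store closing and bail out early.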
            this.shutdownLock.setMaxPermits(1);
            LOG.info("[RocksRawKVStore] start successfully, options: {}.", opts);
            return true;
        } catch (final Exception e) {
            LOG.error("Fail to open rocksDB at path {}, {}.", opts.getDbPath(), StackTraceUtil.stackTrace(e));
        } finally {
            writeLock.unlock();
        }
        return false;
    }

    @Override
    public void shutdown() {
        final Lock writeLock = this.readWriteLock.writeLock();
        writeLock.lock();
        try {
            if (this.db == null) {
                return;
            }
            this.shutdownLock.setMaxPermits(0);
            closeRocksDB();
            if (this.defaultHandle != null) {
                this.defaultHandle.close();
                this.defaultHandle = null;
            }
            if (this.sequenceHandle != null) {
                this.sequenceHandle.close();
                this.sequenceHandle = null;
            }
            if (this.lockingHandle != null) {
                this.lockingHandle.close();
                this.lockingHandle = null;
            }
            if (this.fencingHandle != null) {
                this.fencingHandle.close();
                this.fencingHandle = null;
            }
            for (final ColumnFamilyOptions cfOptions : this.cfOptionsList) {
                cfOptions.close();
            }
            this.cfOptionsList.clear();
            this.cfDescriptors.clear();
            if (this.options != null) {
                this.options.close();
                this.options = null;
            }
            if (this.statisticsCollector != null) {
                try {
                    this.statisticsCollector.shutdown(3000);
                } catch (final Throwable ignored) {
                    // ignored
                }
            }
            if (this.statistics != null) {
                this.statistics.close();
                this.statistics = null;
            }
            if (this.writeOptions != null) {
                this.writeOptions.close();
                this.writeOptions = null;
            }
        } finally {
            writeLock.unlock();
            LOG.info("[RocksRawKVStore] shutdown successfully.");
        }
    }

    @Override
    public KVIterator localIterator() {
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
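            // The current database version is captured here; presumably
            // RocksKVIterator uses it to detect that the db instance has been
            // re-opened and to invalidate itself instead of touching a stale handle.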
            return new RocksKVIterator(this, this.db.newIterator(), readLock, getDatabaseVersion());
        } finally {
            readLock.unlock();
        }
    }

    @Override
    public void get(final byte[] key, @SuppressWarnings("unused") final boolean readOnlySafe,
                    final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("GET");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final byte[] value = this.db.get(key);
            setSuccess(closure, value);
        } catch (final Exception e) {
            LOG.error("Fail to [GET], key: [{}], {}.", BytesUtil.toHex(key), StackTraceUtil.stackTrace(e));
            setFailure(closure, "Fail to [GET]");
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void multiGet(final List<byte[]> keys, @SuppressWarnings("unused") final boolean readOnlySafe,
                         final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("MULTI_GET");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final Map<byte[], byte[]> rawMap = this.db.multiGet(keys);
            final Map<ByteArray, byte[]> resultMap = Maps.newHashMapWithExpectedSize(rawMap.size());
            for (final Map.Entry<byte[], byte[]> entry : rawMap.entrySet()) {
                resultMap.put(ByteArray.wrap(entry.getKey()), entry.getValue());
            }
            setSuccess(closure, resultMap);
        } catch (final Exception e) {
            LOG.error("Fail to [MULTI_GET], key size: [{}], {}.", keys.size(), StackTraceUtil.stackTrace(e));
            setFailure(closure, "Fail to [MULTI_GET]");
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void containsKey(final byte[] key, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("CONTAINS_KEY");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
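            // keyMayExist() is only a cheap, possibly-false-positive check against
            // bloom filters / memtables; a real get() is still needed to confirm
            // that the key actually exists.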
            boolean exists = false;
            if (this.db.keyMayExist(key, new StringBuilder(0))) {
                exists = this.db.get(key) != null;
            }
            setSuccess(closure, exists);
        } catch (final Exception e) {
            LOG.error("Fail to [CONTAINS_KEY], key: [{}], {}.", BytesUtil.toHex(key), StackTraceUtil.stackTrace(e));
            setFailure(closure, "Fail to [CONTAINS_KEY]");
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void scan(final byte[] startKey, final byte[] endKey, final int limit,
                     @SuppressWarnings("unused") final boolean readOnlySafe, final boolean returnValue,
                     final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("SCAN");
        final List<KVEntry> entries = Lists.newArrayList();
        final int maxCount = normalizeLimit(limit);
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try (final RocksIterator it = this.db.newIterator()) {
            if (startKey == null) {
                it.seekToFirst();
            } else {
                it.seek(startKey);
            }
            int count = 0;
            while (it.isValid() && count++ < maxCount) {
                final byte[] key = it.key();
                if (endKey != null && BytesUtil.compare(key, endKey) >= 0) {
                    break;
                }
                entries.add(new KVEntry(key, returnValue ? it.value() : null));
                it.next();
            }
            setSuccess(closure, entries);
        } catch (final Exception e) {
            LOG.error("Fail to [SCAN], range: ['[{}, {})'], {}.", BytesUtil.toHex(startKey), BytesUtil.toHex(endKey),
                StackTraceUtil.stackTrace(e));
            setFailure(closure, "Fail to [SCAN]");
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void reverseScan(final byte[] startKey, final byte[] endKey, final int limit,
                            @SuppressWarnings("unused") final boolean readOnlySafe, final boolean returnValue,
                            final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("REVERSE_SCAN");
        final List<KVEntry> entries = Lists.newArrayList();
        final int maxCount = normalizeLimit(limit);
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try (final RocksIterator it = this.db.newIterator()) {
            if (startKey == null) {
                it.seekToLast();
            } else {
                it.seekForPrev(startKey);
            }
            int count = 0;
            while (it.isValid() && count++ < maxCount) {
                final byte[] key = it.key();
                if (endKey != null && BytesUtil.compare(key, endKey) <= 0) {
                    break;
                }
                entries.add(new KVEntry(key, returnValue ? it.value() : null));
                it.prev();
            }
            setSuccess(closure, entries);
        } catch (final Exception e) {
            LOG.error("Fail to [REVERSE_SCAN], range: ['[{}, {})'], {}.", BytesUtil.toHex(startKey),
                BytesUtil.toHex(endKey), StackTraceUtil.stackTrace(e));
            setFailure(closure, "Fail to [REVERSE_SCAN]");
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void getSequence(final byte[] seqKey, final int step, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("GET_SEQUENCE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
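            // Sequence values are stored as a fixed 8-byte long per key. A call with
            // step > 0 reserves the range [startVal, endVal) for the caller and
            // persists endVal as the new floor for the next caller.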
            final byte[] prevBytesVal = this.db.get(this.sequenceHandle, seqKey);
            long startVal;
            if (prevBytesVal == null) {
                startVal = 0;
            } else {
                startVal = Bits.getLong(prevBytesVal, 0);
            }
            if (step < 0) {
                // never get here
                setFailure(closure, "Fail to [GET_SEQUENCE], step must >= 0");
                return;
            }
            if (step == 0) {
                setSuccess(closure, new Sequence(startVal, startVal));
                return;
            }
            final long endVal = getSafeEndValueForSequence(startVal, step);
            if (startVal != endVal) {
                final byte[] newBytesVal = new byte[8];
                Bits.putLong(newBytesVal, 0, endVal);
                this.db.put(this.sequenceHandle, this.writeOptions, seqKey, newBytesVal);
            }
            setSuccess(closure, new Sequence(startVal, endVal));
        } catch (final Exception e) {
            LOG.error("Fail to [GET_SEQUENCE], [key = {}, step = {}], {}.", BytesUtil.toHex(seqKey), step,
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [GET_SEQUENCE]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void resetSequence(final byte[] seqKey, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("RESET_SEQUENCE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            this.db.delete(this.sequenceHandle, seqKey);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Fail to [RESET_SEQUENCE], [key = {}], {}.", BytesUtil.toHex(seqKey),
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [RESET_SEQUENCE]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchResetSequence(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            resetSequence(kvState.getOp().getKey(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_RESET_SEQUENCE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
                    for (final KVState kvState : segment) {
                        batch.delete(sequenceHandle, kvState.getOp().getKey());
                    }
                    this.db.write(this.writeOptions, batch);
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), Boolean.TRUE);
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_RESET_SEQUENCE], [size = {}], {}.", segment.size(),
                        StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_RESET_SEQUENCE]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void put(final byte[] key, final byte[] value, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("PUT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            this.db.put(this.writeOptions, key, value);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Fail to [PUT], [{}, {}], {}.", BytesUtil.toHex(key), BytesUtil.toHex(value),
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [PUT]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchPut(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            final KVOperation op = kvState.getOp();
            put(op.getKey(), op.getValue(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_PUT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
                    for (final KVState kvState : segment) {
                        final KVOperation op = kvState.getOp();
                        batch.put(op.getKey(), op.getValue());
                    }
                    this.db.write(this.writeOptions, batch);
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), Boolean.TRUE);
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_PUT], [size = {}] {}.", segment.size(), StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_PUT]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void getAndPut(final byte[] key, final byte[] value, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("GET_PUT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final byte[] prevVal = this.db.get(key);
            this.db.put(this.writeOptions, key, value);
            setSuccess(closure, prevVal);
        } catch (final Exception e) {
            LOG.error("Fail to [GET_PUT], [{}, {}], {}.", BytesUtil.toHex(key), BytesUtil.toHex(value),
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [GET_PUT]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchGetAndPut(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            final KVOperation op = kvState.getOp();
            getAndPut(op.getKey(), op.getValue(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_GET_PUT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
                    final List<byte[]> keys = Lists.newArrayListWithCapacity(segment.size());
                    for (final KVState kvState : segment) {
                        final KVOperation op = kvState.getOp();
                        final byte[] key = op.getKey();
                        keys.add(key);
                        batch.put(key, op.getValue());
                    }
                    // first, get prev values
                    final Map<byte[], byte[]> prevValMap = this.db.multiGet(keys);
                    this.db.write(this.writeOptions, batch);
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), prevValMap.get(kvState.getOp().getKey()));
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_GET_PUT], [size = {}] {}.", segment.size(),
                        StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_GET_PUT]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void compareAndPut(final byte[] key, final byte[] expect, final byte[] update, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("COMPARE_PUT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final byte[] actual = this.db.get(key);
            if (Arrays.equals(expect, actual)) {
                this.db.put(this.writeOptions, key, update);
                setSuccess(closure, Boolean.TRUE);
            } else {
                setSuccess(closure, Boolean.FALSE);
            }
        } catch (final Exception e) {
            LOG.error("Fail to [COMPARE_PUT], [{}, {}, {}], {}.", BytesUtil.toHex(key), BytesUtil.toHex(expect),
                BytesUtil.toHex(update), StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [COMPARE_PUT]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchCompareAndPut(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            final KVOperation op = kvState.getOp();
            compareAndPut(op.getKey(), op.getExpect(), op.getValue(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_COMPARE_PUT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
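                    // Two-phase completion: stage each closure's compare result with
                    // setData() first, write the whole batch once, and only then flip
                    // the closures to success, so that a failed write fails the whole
                    // segment instead of acknowledging half of it.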
                    final Map<byte[], byte[]> expects = Maps.newHashMapWithExpectedSize(segment.size());
                    final Map<byte[], byte[]> updates = Maps.newHashMapWithExpectedSize(segment.size());
                    for (final KVState kvState : segment) {
                        final KVOperation op = kvState.getOp();
                        final byte[] key = op.getKey();
                        final byte[] expect = op.getExpect();
                        final byte[] update = op.getValue();
                        expects.put(key, expect);
                        updates.put(key, update);
                    }
                    final Map<byte[], byte[]> prevValMap = this.db.multiGet(Lists.newArrayList(expects.keySet()));
                    for (final KVState kvState : segment) {
                        final byte[] key = kvState.getOp().getKey();
                        if (Arrays.equals(expects.get(key), prevValMap.get(key))) {
                            batch.put(key, updates.get(key));
                            setData(kvState.getDone(), Boolean.TRUE);
                        } else {
                            setData(kvState.getDone(), Boolean.FALSE);
                        }
                    }
                    if (batch.count() > 0) {
                        this.db.write(this.writeOptions, batch);
                    }
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), getData(kvState.getDone()));
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_COMPARE_PUT], [size = {}] {}.", segment.size(),
                        StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_COMPARE_PUT]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void merge(final byte[] key, final byte[] value, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("MERGE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            this.db.merge(this.writeOptions, key, value);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Fail to [MERGE], [{}, {}], {}.", BytesUtil.toHex(key), BytesUtil.toHex(value),
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [MERGE]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchMerge(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            final KVOperation op = kvState.getOp();
            merge(op.getKey(), op.getValue(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_MERGE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
                    for (final KVState kvState : segment) {
                        final KVOperation op = kvState.getOp();
                        batch.merge(op.getKey(), op.getValue());
                    }
                    this.db.write(this.writeOptions, batch);
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), Boolean.TRUE);
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_MERGE], [size = {}] {}.", segment.size(), StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_MERGE]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void put(final List<KVEntry> entries, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("PUT_LIST");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try (final WriteBatch batch = new WriteBatch()) {
            for (final KVEntry entry : entries) {
                batch.put(entry.getKey(), entry.getValue());
            }
            this.db.write(this.writeOptions, batch);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Failed to [PUT_LIST], [size = {}], {}.", entries.size(), StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [PUT_LIST]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void putIfAbsent(final byte[] key, final byte[] value, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("PUT_IF_ABSENT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final byte[] prevVal = this.db.get(key);
            if (prevVal == null) {
                this.db.put(this.writeOptions, key, value);
            }
            setSuccess(closure, prevVal);
        } catch (final Exception e) {
            LOG.error("Fail to [PUT_IF_ABSENT], [{}, {}], {}.", BytesUtil.toHex(key), BytesUtil.toHex(value),
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [PUT_IF_ABSENT]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchPutIfAbsent(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            final KVOperation op = kvState.getOp();
            putIfAbsent(op.getKey(), op.getValue(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_PUT_IF_ABSENT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
                    final List<byte[]> keys = Lists.newArrayListWithCapacity(segment.size());
                    final Map<byte[], byte[]> values = Maps.newHashMapWithExpectedSize(segment.size());
                    for (final KVState kvState : segment) {
                        final KVOperation op = kvState.getOp();
                        final byte[] key = op.getKey();
                        final byte[] value = op.getValue();
                        keys.add(key);
                        values.put(key, value);
                    }
                    final Map<byte[], byte[]> prevValMap = this.db.multiGet(keys);
                    for (final KVState kvState : segment) {
                        final byte[] key = kvState.getOp().getKey();
                        final byte[] prevVal = prevValMap.get(key);
                        if (prevVal == null) {
                            batch.put(key, values.get(key));
                        }
                        setData(kvState.getDone(), prevVal);
                    }
                    if (batch.count() > 0) {
                        this.db.write(this.writeOptions, batch);
                    }
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), getData(kvState.getDone()));
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_PUT_IF_ABSENT], [size = {}] {}.", segment.size(),
                        StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_PUT_IF_ABSENT]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void tryLockWith(final byte[] key, final byte[] fencingKey, final boolean keepLease,
                            final DistributedLock.Acquirer acquirer, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("TRY_LOCK");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            // The algorithm relies on the assumption that while there is no
            // synchronized clock across the processes, still the local time in
            // every process flows approximately at the same rate, with an error
            // which is small compared to the auto-release time of the lock.
            final long now = acquirer.getLockingTimestamp();
            final long timeoutMillis = acquirer.getLeaseMillis();
            final byte[] prevBytesVal = this.db.get(this.lockingHandle, key);

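            // The do { ... } while (false) below is a structured goto: each branch
            // computes an owner and breaks out. The cases, in order:
            //   1. no current owner   -> acquire (a keep-lease request fails);
            //   2. owner out of lease -> take over with a fresh fencing token;
            //   3. same acquirer      -> keep the lease, or re-enter (acquires + 1);
            //   4. held by another    -> fail and report the remaining lease time.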
            final DistributedLock.Owner owner;
            // noinspection ConstantConditions
            do {
                final DistributedLock.OwnerBuilder builder = DistributedLock.newOwnerBuilder();
                if (prevBytesVal == null) {
                    // no others own this lock
                    if (keepLease) {
                        // it wants to keep the lease but is too late, will return failure
                        owner = builder //
                            // set acquirer id
                            .id(acquirer.getId())
                            // fail to keep lease
                            .remainingMillis(DistributedLock.OwnerBuilder.KEEP_LEASE_FAIL)
                            // set failure
                            .success(false).build();
                        break;
                    }
                    // this is the first attempt to acquire the lock (another possibility
                    // is that this lock has been deleted), will return success
                    owner = builder //
                        // set acquirer id, now it will own the lock
                        .id(acquirer.getId())
                        // set a new deadline
                        .deadlineMillis(now + timeoutMillis)
                        // first time to acquire and success
                        .remainingMillis(DistributedLock.OwnerBuilder.FIRST_TIME_SUCCESS)
                        // create a new fencing token
                        .fencingToken(getNextFencingToken(fencingKey))
                        // init acquires
                        .acquires(1)
                        // set acquirer ctx
                        .context(acquirer.getContext())
                        // set successful
                        .success(true).build();
                    this.db.put(this.lockingHandle, this.writeOptions, key, this.serializer.writeObject(owner));
                    break;
                }

                // this lock has an owner, check if it has expired
                final DistributedLock.Owner prevOwner = this.serializer.readObject(prevBytesVal,
                    DistributedLock.Owner.class);
                final long remainingMillis = prevOwner.getDeadlineMillis() - now;
                if (remainingMillis < 0) {
                    // the previous owner is out of lease
                    if (keepLease) {
                            // it wants to keep the lease but is too late, will return failure
                        owner = builder //
                            // still previous owner id
                            .id(prevOwner.getId())
                            // do not update
                            .deadlineMillis(prevOwner.getDeadlineMillis())
                            // fail to keep lease
                            .remainingMillis(DistributedLock.OwnerBuilder.KEEP_LEASE_FAIL)
                            // set previous ctx
                            .context(prevOwner.getContext())
                            // set failure
                            .success(false).build();
                        break;
                    }
                    // create new lock owner
                    owner = builder //
                        // set acquirer id, now it will own the lock
                        .id(acquirer.getId())
                        // set a new deadline
                        .deadlineMillis(now + timeoutMillis)
                        // success as a new acquirer
                        .remainingMillis(DistributedLock.OwnerBuilder.NEW_ACQUIRE_SUCCESS)
                        // create a new fencing token
                        .fencingToken(getNextFencingToken(fencingKey))
                        // init acquires
                        .acquires(1)
                        // set acquirer ctx
                        .context(acquirer.getContext())
                        // set successful
                        .success(true).build();
                    this.db.put(this.lockingHandle, this.writeOptions, key, this.serializer.writeObject(owner));
                    break;
                }

                // the previous owner is not out of lease (remainingMillis >= 0)
                final boolean isReentrant = prevOwner.isSameAcquirer(acquirer);
                if (isReentrant) {
                        // the same old friend has come back (reentrant lock)
                    if (keepLease) {
                            // the old friend only wants to keep the lease of the lock
                        owner = builder //
                            // still previous owner id
                            .id(prevOwner.getId())
                            // update the deadline to keep lease
                            .deadlineMillis(now + timeoutMillis)
                            // success to keep lease
                            .remainingMillis(DistributedLock.OwnerBuilder.KEEP_LEASE_SUCCESS)
                            // keep fencing token
                            .fencingToken(prevOwner.getFencingToken())
                            // keep acquires
                            .acquires(prevOwner.getAcquires())
                            // do not update ctx when keeping lease
                            .context(prevOwner.getContext())
                            // set successful
                            .success(true).build();
                        this.db.put(this.lockingHandle, this.writeOptions, key, this.serializer.writeObject(owner));
                        break;
                    }
                    // now we are sure that it is an old friend who is back again (reentrant lock)
                    owner = builder //
                        // still previous owner id
                        .id(prevOwner.getId())
                        // by the way, the lease will also be kept
                        .deadlineMillis(now + timeoutMillis)
                        // success reentrant
                        .remainingMillis(DistributedLock.OwnerBuilder.REENTRANT_SUCCESS)
                        // keep fencing token
                        .fencingToken(prevOwner.getFencingToken())
                        // acquires++
                        .acquires(prevOwner.getAcquires() + 1)
                        // update ctx when reentrant
                        .context(acquirer.getContext())
                        // set successful
                        .success(true).build();
                    this.db.put(this.lockingHandle, this.writeOptions, key, this.serializer.writeObject(owner));
                    break;
                }

                // the lock exists and the previous locker is not the same as the current acquirer
                owner = builder //
                    // set previous owner id to tell who is the real owner
                    .id(prevOwner.getId())
                    // set the remaining lease time of current owner
                    .remainingMillis(remainingMillis)
                    // set previous ctx
                    .context(prevOwner.getContext())
                    // set failure
                    .success(false).build();
                LOG.debug("Another locker [{}] is trying the existed lock [{}].", acquirer, prevOwner);
            } while (false);

            setSuccess(closure, owner);
        } catch (final Exception e) {
            LOG.error("Fail to [TRY_LOCK], [{}, {}], {}.", BytesUtil.toHex(key), acquirer, StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [TRY_LOCK]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void releaseLockWith(final byte[] key, final DistributedLock.Acquirer acquirer, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("RELEASE_LOCK");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final byte[] prevBytesVal = this.db.get(this.lockingHandle, key);

            final DistributedLock.Owner owner;
            // noinspection ConstantConditions
            do {
                final DistributedLock.OwnerBuilder builder = DistributedLock.newOwnerBuilder();
                if (prevBytesVal == null) {
                    LOG.warn("Lock not exist: {}.", acquirer);
                    owner = builder //
                        // set acquirer id
                        .id(acquirer.getId())
                        // set acquirer fencing token
                        .fencingToken(acquirer.getFencingToken())
                        // set acquires=0
                        .acquires(0)
                        // set successful
                        .success(true).build();
                    break;
                }

                final DistributedLock.Owner prevOwner = this.serializer.readObject(prevBytesVal,
                    DistributedLock.Owner.class);

                if (prevOwner.isSameAcquirer(acquirer)) {
                    final long acquires = prevOwner.getAcquires() - 1;
                    owner = builder //
                        // still previous owner id
                        .id(prevOwner.getId())
                        // do not update deadline
                        .deadlineMillis(prevOwner.getDeadlineMillis())
                        // keep fencing token
                        .fencingToken(prevOwner.getFencingToken())
                        // acquires--
                        .acquires(acquires)
                        // set previous ctx
                        .context(prevOwner.getContext())
                        // set successful
                        .success(true).build();
                    if (acquires <= 0) {
                        // real delete, goodbye ~
                        this.db.delete(this.lockingHandle, this.writeOptions, key);
                    } else {
                        // acquires--
                        this.db.put(this.lockingHandle, this.writeOptions, key, this.serializer.writeObject(owner));
                    }
                    break;
                }

                // invalid acquirer, can't release the lock
                owner = builder //
                    // set previous owner id to tell who is the real owner
                    .id(prevOwner.getId())
                    // keep previous fencing token
                    .fencingToken(prevOwner.getFencingToken())
                    // do not update acquires
                    .acquires(prevOwner.getAcquires())
                    // set previous ctx
                    .context(prevOwner.getContext())
                    // set failure
                    .success(false).build();
                LOG.warn("The lock owner is: [{}], [{}] could't release it.", prevOwner, acquirer);
            } while (false);

            setSuccess(closure, owner);
        } catch (final Exception e) {
            LOG.error("Fail to [RELEASE_LOCK], [{}], {}.", BytesUtil.toHex(key), StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [RELEASE_LOCK]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    private long getNextFencingToken(final byte[] fencingKey) throws RocksDBException {
        final Timer.Context timeCtx = getTimeContext("FENCING_TOKEN");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
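            // The token increases monotonically for each fencing key, so resources
            // protected by the lock can reject requests carrying a token older than
            // one they have already seen (i.e. from a stale, expired lock holder).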
            final byte[] realKey = BytesUtil.nullToEmpty(fencingKey);
            final byte[] prevBytesVal = this.db.get(this.fencingHandle, realKey);
            final long prevVal;
            if (prevBytesVal == null) {
                prevVal = 0; // init
            } else {
                prevVal = Bits.getLong(prevBytesVal, 0);
            }
            // Don't worry about the token number overflowing.
            // It would take about 290,000 years for a system running at
            // 1 million TPS to exhaust the range [0 ~ Long.MAX_VALUE].
            final long newVal = prevVal + 1;
            final byte[] newBytesVal = new byte[8];
            Bits.putLong(newBytesVal, 0, newVal);
            this.db.put(this.fencingHandle, this.writeOptions, realKey, newBytesVal);
            return newVal;
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void delete(final byte[] key, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("DELETE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            this.db.delete(this.writeOptions, key);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Fail to [DELETE], [{}], {}.", BytesUtil.toHex(key), StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [DELETE]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void batchDelete(final KVStateOutputList kvStates) {
        if (kvStates.isSingletonList()) {
            final KVState kvState = kvStates.getSingletonElement();
            delete(kvState.getOp().getKey(), kvState.getDone());
            return;
        }
        final Timer.Context timeCtx = getTimeContext("BATCH_DELETE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            Partitions.manyToOne(kvStates, MAX_BATCH_WRITE_SIZE, (Function<List<KVState>, Void>) segment -> {
                try (final WriteBatch batch = new WriteBatch()) {
                    for (final KVState kvState : segment) {
                        batch.delete(kvState.getOp().getKey());
                    }
                    this.db.write(this.writeOptions, batch);
                    for (final KVState kvState : segment) {
                        setSuccess(kvState.getDone(), Boolean.TRUE);
                    }
                } catch (final Exception e) {
                    LOG.error("Failed to [BATCH_DELETE], [size = {}], {}.", segment.size(),
                        StackTraceUtil.stackTrace(e));
                    setCriticalError(Lists.transform(kvStates, KVState::getDone), "Fail to [BATCH_DELETE]", e);
                }
                return null;
            });
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void deleteRange(final byte[] startKey, final byte[] endKey, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("DELETE_RANGE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            this.db.deleteRange(this.writeOptions, startKey, endKey);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Fail to [DELETE_RANGE], ['[{}, {})'], {}.", BytesUtil.toHex(startKey), BytesUtil.toHex(endKey),
                StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [DELETE_RANGE]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void delete(final List<byte[]> keys, final KVStoreClosure closure) {
        final Timer.Context timeCtx = getTimeContext("DELETE_LIST");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try (final WriteBatch batch = new WriteBatch()) {
            for (final byte[] key : keys) {
                batch.delete(key);
            }
            this.db.write(this.writeOptions, batch);
            setSuccess(closure, Boolean.TRUE);
        } catch (final Exception e) {
            LOG.error("Failed to [DELETE_LIST], [size = {}], {}.", keys.size(), StackTraceUtil.stackTrace(e));
            setCriticalError(closure, "Fail to [DELETE_LIST]", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public long getApproximateKeysInRange(final byte[] startKey, final byte[] endKey) {
        // TODO This is sad code, the performance is really bad: it counts keys by iterating over them one by one
        final Timer.Context timeCtx = getTimeContext("APPROXIMATE_KEYS");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        final Snapshot snapshot = this.db.getSnapshot();
        try (final ReadOptions readOptions = new ReadOptions()) {
            readOptions.setSnapshot(snapshot);
            try (final RocksIterator it = this.db.newIterator(readOptions)) {
                if (startKey == null) {
                    it.seekToFirst();
                } else {
                    it.seek(startKey);
                }
                long approximateKeys = 0;
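                // Count in chunks of 100 and only compare against endKey at each
                // chunk boundary: this trades up to 100 keys of accuracy for far
                // fewer key comparisons.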
                for (;;) {
                    // The accuracy is 100, don't ask for more
                    for (int i = 0; i < 100; i++) {
                        if (!it.isValid()) {
                            return approximateKeys;
                        }
                        it.next();
                        ++approximateKeys;
                    }
                    // guard: the iterator may have stepped off the end in the inner loop
                    if (!it.isValid() || (endKey != null && BytesUtil.compare(it.key(), endKey) >= 0)) {
                        return approximateKeys;
                    }
                }
            }
        } finally {
            // Nothing to release, rocksDB never owns the pointer for a snapshot.
            snapshot.close();
            // The pointer to the snapshot is released by the database instance.
            this.db.releaseSnapshot(snapshot);
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public byte[] jumpOver(final byte[] startKey, final long distance) {
        final Timer.Context timeCtx = getTimeContext("JUMP_OVER");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        final Snapshot snapshot = this.db.getSnapshot();
        try (final ReadOptions readOptions = new ReadOptions()) {
            readOptions.setSnapshot(snapshot);
            try (final RocksIterator it = this.db.newIterator(readOptions)) {
                if (startKey == null) {
                    it.seekToFirst();
                } else {
                    it.seek(startKey);
                }
                long approximateKeys = 0;
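                // Walk forward roughly `distance` keys from startKey, again with a
                // chunk accuracy of 100, and return the key found there; presumably
                // used to pick a split point inside a region.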
                for (;;) {
                    byte[] lastKey = null;
                    if (it.isValid()) {
                        lastKey = it.key();
                    }
                    // The accuracy is 100, don't ask for more
                    for (int i = 0; i < 100; i++) {
                        if (!it.isValid()) {
                            return lastKey;
                        }
                        it.next();
                        if (++approximateKeys >= distance) {
                            // the iterator may have just stepped off the end here; fall
                            // back to the last key observed at this chunk boundary
                            return it.isValid() ? it.key() : lastKey;
                        }
                    }
                }
            }
        } finally {
            // Nothing to release, rocksDB never owns the pointer for a snapshot.
            snapshot.close();
            // The pointer to the snapshot is released by the database instance.
            this.db.releaseSnapshot(snapshot);
            readLock.unlock();
            timeCtx.stop();
        }
    }

    @Override
    public void initFencingToken(final byte[] parentKey, final byte[] childKey) {
        final Timer.Context timeCtx = getTimeContext("INIT_FENCING_TOKEN");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final byte[] realKey = BytesUtil.nullToEmpty(parentKey);
            final byte[] parentBytesVal = this.db.get(this.fencingHandle, realKey);
            if (parentBytesVal == null) {
                return;
            }
            this.db.put(this.fencingHandle, this.writeOptions, childKey, parentBytesVal);
        } catch (final RocksDBException e) {
            throw new StorageException("Fail to init fencing token.", e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

    public long getDatabaseVersion() {
        return this.databaseVersion.get();
    }

    public void addStatisticsCollectorCallback(final StatisticsCollectorCallback callback) {
        final RocksStatisticsCollector collector = Requires.requireNonNull(this.statisticsCollector,
            "statisticsCollector");
        final Statistics statistics = Requires.requireNonNull(this.statistics, "statistics");
        collector.addStatsCollectorInput(new StatsCollectorInput(statistics, callback));
    }

    boolean isFastSnapshot() {
        return Requires.requireNonNull(this.opts, "opts").isFastSnapshot();
    }

    boolean isAsyncSnapshot() {
        return Requires.requireNonNull(this.opts, "opts").isAsyncSnapshot();
    }

    CompletableFuture<Void> createSstFiles(final EnumMap<SstColumnFamily, File> sstFileTable, final byte[] startKey,
                                           final byte[] endKey, final ExecutorService executor) {
        final Snapshot snapshot;
        final CompletableFuture<Void> sstFuture = new CompletableFuture<>();
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
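            // The snapshot must be taken while holding the read lock so it refers to
            // a stable db instance; the (possibly slow) SST writing itself may then
            // run outside the lock when async snapshot is enabled.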
            snapshot = this.db.getSnapshot();
            if (!isAsyncSnapshot()) {
                doCreateSstFiles(snapshot, sstFileTable, startKey, endKey, sstFuture);
                return sstFuture;
            }
        } finally {
            readLock.unlock();
        }

        // async snapshot
        executor.execute(() -> doCreateSstFiles(snapshot, sstFileTable, startKey, endKey, sstFuture));
        return sstFuture;
    }

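    // The actual sst dump: for each column family, iterates [startKey, endKey)
    // under the snapshot and feeds an SstFileWriter. Always releases the
    // snapshot, and reports success or failure through the given future.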
    void doCreateSstFiles(final Snapshot snapshot, final EnumMap<SstColumnFamily, File> sstFileTable,
                          final byte[] startKey, final byte[] endKey, final CompletableFuture<Void> future) {
        final Timer.Context timeCtx = getTimeContext("CREATE_SST_FILE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            if (!this.shutdownLock.isAvailable()) {
                // The KV store has been shut down; bail out without releasing
                // rocksdb's snapshot, since the db may already be closed.
                future.completeExceptionally(new StorageException("KV store has shutdown."));
                return;
            }
            try (final ReadOptions readOptions = new ReadOptions();
                    final EnvOptions envOptions = new EnvOptions();
                    final Options options = new Options().setMergeOperator(new StringAppendOperator())) {
                readOptions.setSnapshot(snapshot);
                for (final Map.Entry<SstColumnFamily, File> entry : sstFileTable.entrySet()) {
                    final SstColumnFamily sstColumnFamily = entry.getKey();
                    final File sstFile = entry.getValue();
                    final ColumnFamilyHandle columnFamilyHandle = findColumnFamilyHandle(sstColumnFamily);
                    try (final RocksIterator it = this.db.newIterator(columnFamilyHandle, readOptions);
                            final SstFileWriter sstFileWriter = new SstFileWriter(envOptions, options)) {
                        if (startKey == null) {
                            it.seekToFirst();
                        } else {
                            it.seek(startKey);
                        }
                        sstFileWriter.open(sstFile.getAbsolutePath());
                        long count = 0;
                        for (;;) {
                            if (!it.isValid()) {
                                break;
                            }
                            final byte[] key = it.key();
                            if (endKey != null && BytesUtil.compare(key, endKey) >= 0) {
                                break;
                            }
                            sstFileWriter.put(key, it.value());
                            ++count;
                            it.next();
                        }
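                        // SstFileWriter#finish() fails on a file with no entries,
                        // so an empty writer is just closed; the resulting empty
                        // file is skipped later by ingestSstFiles().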
                        if (count == 0) {
                            sstFileWriter.close();
                        } else {
                            sstFileWriter.finish();
                        }
                        LOG.info("Finish sst file {} with {} keys.", sstFile, count);
                    } catch (final RocksDBException e) {
                        throw new StorageException("Fail to create sst file at path: " + sstFile, e);
                    }
                }
                future.complete(null);
            } catch (final Throwable t) {
                future.completeExceptionally(t);
            } finally {
                // snapshot.close() only disposes the Java-side wrapper; RocksDB never
                // owns the pointer for a snapshot.
                snapshot.close();
                // The native snapshot itself must be released via the database instance.
                this.db.releaseSnapshot(snapshot);
            }
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

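    // Bulk-loads the sst files produced by createSstFiles() into their column
    // families. Empty files are skipped, since RocksDB refuses to ingest them.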
    void ingestSstFiles(final EnumMap<SstColumnFamily, File> sstFileTable) {
        final Timer.Context timeCtx = getTimeContext("INGEST_SST_FILE");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            for (final Map.Entry<SstColumnFamily, File> entry : sstFileTable.entrySet()) {
                final SstColumnFamily sstColumnFamily = entry.getKey();
                final File sstFile = entry.getValue();
                final ColumnFamilyHandle columnFamilyHandle = findColumnFamilyHandle(sstColumnFamily);
                try (final IngestExternalFileOptions ingestOptions = new IngestExternalFileOptions()) {
                    if (FileUtils.sizeOf(sstFile) == 0L) {
                        // An empty sst file cannot be ingested; skip it rather
                        // than abort the remaining column families.
                        continue;
                    }
                    final String filePath = sstFile.getAbsolutePath();
                    LOG.info("Start ingest sst file {}.", filePath);
                    this.db.ingestExternalFile(columnFamilyHandle, Collections.singletonList(filePath), ingestOptions);
                } catch (final RocksDBException e) {
                    throw new StorageException("Fail to ingest sst file at path: " + sstFile, e);
                }
            }
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

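    // Creates a full backup of the current db under backupDBPath via rocksdb's
    // BackupEngine and returns the info of the newest backup, or null if the
    // engine reports no backups.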
    RocksDBBackupInfo backupDB(final String backupDBPath) throws IOException {
        final Timer.Context timeCtx = getTimeContext("BACKUP_DB");
        FileUtils.forceMkdir(new File(backupDBPath));
        final Lock writeLock = this.readWriteLock.writeLock();
        writeLock.lock();
        try (final BackupableDBOptions backupOpts = createBackupDBOptions(backupDBPath);
             final BackupEngine backupEngine = BackupEngine.open(this.options.getEnv(), backupOpts)) {
            backupEngine.createNewBackup(this.db, true);
            final List<BackupInfo> backupInfoList = backupEngine.getBackupInfo();
            if (backupInfoList.isEmpty()) {
                LOG.warn("Fail to backup at {}, empty backup info.", backupDBPath);
                return null;
            }
            // choose the backupInfo with the max backupId, i.e. the backup just created
            final BackupInfo backupInfo = Collections.max(backupInfoList, Comparator.comparingInt(BackupInfo::backupId));
            final RocksDBBackupInfo rocksBackupInfo = new RocksDBBackupInfo(backupInfo);
            LOG.info("Backup rocksDB into {} with backupInfo {}.", backupDBPath, rocksBackupInfo);
            return rocksBackupInfo;
        } catch (final RocksDBException e) {
            throw new StorageException("Fail to backup at path: " + backupDBPath, e);
        } finally {
            writeLock.unlock();
            timeCtx.stop();
        }
    }

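    // Restores the db in place from the given backup: the db is closed, its
    // files are rewritten from the backup, and then it is reopened. The write
    // lock is held for the whole procedure, blocking all readers and writers.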
    void restoreBackup(final String backupDBPath, final RocksDBBackupInfo rocksBackupInfo) {
        final Timer.Context timeCtx = getTimeContext("RESTORE_BACKUP");
        final Lock writeLock = this.readWriteLock.writeLock();
        writeLock.lock();
        closeRocksDB();
        try (final BackupableDBOptions backupOpts = createBackupDBOptions(backupDBPath);
                final BackupEngine backupEngine = BackupEngine.open(this.options.getEnv(), backupOpts);
                final RestoreOptions restoreOpts = new RestoreOptions(false)) {
            final String dbPath = this.opts.getDbPath();
            backupEngine.restoreDbFromBackup(rocksBackupInfo.getBackupId(), dbPath, dbPath, restoreOpts);
            LOG.info("Restored rocksDB from {} with {}.", backupDBPath, rocksBackupInfo);
            // reopen the db
            openRocksDB(this.opts);
        } catch (final RocksDBException e) {
            throw new StorageException("Fail to restore from path: " + backupDBPath, e);
        } finally {
            writeLock.unlock();
            timeCtx.stop();
        }
    }

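    // Writes a snapshot by taking a rocksdb Checkpoint (hard links when the
    // target is on the same filesystem) into a temp directory, then renaming
    // it to snapshotPath so a half-written snapshot is never visible.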
    void writeSnapshot(final String snapshotPath) {
        final Timer.Context timeCtx = getTimeContext("WRITE_SNAPSHOT");
        final Lock writeLock = this.readWriteLock.writeLock();
        writeLock.lock();
        try (final Checkpoint checkpoint = Checkpoint.create(this.db)) {
            final String tempPath = snapshotPath + "_temp";
            final File tempFile = new File(tempPath);
            FileUtils.deleteDirectory(tempFile);
            checkpoint.createCheckpoint(tempPath);
            final File snapshotFile = new File(snapshotPath);
            FileUtils.deleteDirectory(snapshotFile);
            if (!tempFile.renameTo(snapshotFile)) {
                throw new StorageException("Fail to rename [" + tempPath + "] to [" + snapshotPath + "].");
            }
        } catch (final StorageException e) {
            throw e;
        } catch (final Exception e) {
            throw new StorageException("Fail to write snapshot at path: " + snapshotPath, e);
        } finally {
            writeLock.unlock();
            timeCtx.stop();
        }
    }

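    // Replaces the current db with the given snapshot directory: closes the db,
    // swaps the directory in, and reopens. A missing snapshot is only logged.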
    void readSnapshot(final String snapshotPath) {
        final Timer.Context timeCtx = getTimeContext("READ_SNAPSHOT");
        final Lock writeLock = this.readWriteLock.writeLock();
        writeLock.lock();
        try {
            final File snapshotFile = new File(snapshotPath);
            if (!snapshotFile.exists()) {
                LOG.error("Snapshot file [{}] not exists.", snapshotPath);
                return;
            }
            closeRocksDB();
            final String dbPath = this.opts.getDbPath();
            final File dbFile = new File(dbPath);
            FileUtils.deleteDirectory(dbFile);
            if (!snapshotFile.renameTo(dbFile)) {
                throw new StorageException("Fail to rename [" + snapshotPath + "] to [" + dbPath + "].");
            }
            // reopen the db
            openRocksDB(this.opts);
        } catch (final Exception e) {
            throw new StorageException("Fail to read snapshot from path: " + snapshotPath, e);
        } finally {
            writeLock.unlock();
            timeCtx.stop();
        }
    }

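    // Writes a snapshot of the region's key range as sst files, one per column
    // family. Like writeSnapshot(), the files are produced in a temp directory
    // that is renamed into place only after every sst file is complete; callers
    // typically wait on the returned future.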
    CompletableFuture<Void> writeSstSnapshot(final String snapshotPath, final Region region, final ExecutorService executor) {
        final Timer.Context timeCtx = getTimeContext("WRITE_SST_SNAPSHOT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final String tempPath = snapshotPath + "_temp";
            final File tempFile = new File(tempPath);
            FileUtils.deleteDirectory(tempFile);
            FileUtils.forceMkdir(tempFile);

            final EnumMap<SstColumnFamily, File> sstFileTable = getSstFileTable(tempPath);
            final CompletableFuture<Void> snapshotFuture = new CompletableFuture<>();
            final CompletableFuture<Void> sstFuture = createSstFiles(sstFileTable, region.getStartKey(),
                region.getEndKey(), executor);
            sstFuture.whenComplete((aVoid, throwable) -> {
                if (throwable == null) {
                    try {
                        final File snapshotFile = new File(snapshotPath);
                        FileUtils.deleteDirectory(snapshotFile);
                        if (!tempFile.renameTo(snapshotFile)) {
                            throw new StorageException("Fail to rename [" + tempPath + "] to [" + snapshotPath + "].");
                        }
                        snapshotFuture.complete(null);
                    } catch (final Throwable t) {
                        snapshotFuture.completeExceptionally(t);
                    }
                } else {
                    snapshotFuture.completeExceptionally(throwable);
                }
            });
            return snapshotFuture;
        } catch (final Exception e) {
            throw new StorageException("Fail to do read sst snapshot at path: " + snapshotPath, e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

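    // Restores a region's data by ingesting the sst files found under
    // snapshotPath into the live db; unlike readSnapshot(), the db stays open.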
    void readSstSnapshot(final String snapshotPath) {
        final Timer.Context timeCtx = getTimeContext("READ_SST_SNAPSHOT");
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            final EnumMap<SstColumnFamily, File> sstFileTable = getSstFileTable(snapshotPath);
            ingestSstFiles(sstFileTable);
        } catch (final Exception e) {
            throw new StorageException("Fail to write sst snapshot at path: " + snapshotPath, e);
        } finally {
            readLock.unlock();
            timeCtx.stop();
        }
    }

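    // Maps each column family to its fixed sst file name under the given path.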
    private EnumMap<SstColumnFamily, File> getSstFileTable(final String path) {
        final EnumMap<SstColumnFamily, File> sstFileTable = new EnumMap<>(SstColumnFamily.class);
        sstFileTable.put(SstColumnFamily.DEFAULT, Paths.get(path, "default.sst").toFile());
        sstFileTable.put(SstColumnFamily.SEQUENCE, Paths.get(path, "sequence.sst").toFile());
        sstFileTable.put(SstColumnFamily.LOCKING, Paths.get(path, "locking.sst").toFile());
        sstFileTable.put(SstColumnFamily.FENCING, Paths.get(path, "fencing.sst").toFile());
        return sstFileTable;
    }

    private ColumnFamilyHandle findColumnFamilyHandle(final SstColumnFamily sstColumnFamily) {
        switch (sstColumnFamily) {
            case DEFAULT:
                return this.defaultHandle;
            case SEQUENCE:
                return this.sequenceHandle;
            case LOCKING:
                return this.lockingHandle;
            case FENCING:
                return this.fencingHandle;
            default:
                throw new IllegalArgumentException("illegal sstColumnFamily: " + sstColumnFamily.name());
        }
    }

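    // Opens the db and caches the column family handles; the handle order must
    // match the order of cfDescriptors. Bumping databaseVersion lets callers
    // detect that the underlying db instance has been replaced.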
    private void openRocksDB(final RocksDBOptions opts) throws RocksDBException {
        final List<ColumnFamilyHandle> cfHandles = Lists.newArrayList();
        this.databaseVersion.incrementAndGet();
        this.db = RocksDB.open(this.options, opts.getDbPath(), this.cfDescriptors, cfHandles);
        this.defaultHandle = cfHandles.get(0);
        this.sequenceHandle = cfHandles.get(1);
        this.lockingHandle = cfHandles.get(2);
        this.fencingHandle = cfHandles.get(3);
    }

    private void closeRocksDB() {
        if (this.db != null) {
            this.db.close();
            this.db = null;
        }
    }

    private void destroyRocksDB(final RocksDBOptions opts) throws RocksDBException {
        // The major difference with directly deleting the DB directory manually is that
        // DestroyDB() will take care of the case where the RocksDB database is stored
        // in multiple directories. For instance, a single DB can be configured to store
        // its data in multiple directories by specifying different paths to
        // DBOptions::db_paths, DBOptions::db_log_dir, and DBOptions::wal_dir.
        try (final Options opt = new Options()) {
            RocksDB.destroyDB(opts.getDbPath(), opt);
        }
    }

    // Creates the rocksDB options; the caller must take care to close
    // them after the db itself has been closed.
    private static DBOptions createDBOptions() {
        return StorageOptionsFactory.getRocksDBOptions(RocksRawKVStore.class) //
            .setEnv(Env.getDefault());
    }

    // Creates the column family options to control the behavior
    // of a database.
    private static ColumnFamilyOptions createColumnFamilyOptions() {
        final BlockBasedTableConfig tConfig = StorageOptionsFactory.getRocksDBTableFormatConfig(RocksRawKVStore.class);
        return StorageOptionsFactory.getRocksDBColumnFamilyOptions(RocksRawKVStore.class) //
            .setTableFormatConfig(tConfig) //
            .setMergeOperator(new StringAppendOperator());
    }

    // Creates the backupable db options to control the behavior of
    // a backupable database.
    private static BackupableDBOptions createBackupDBOptions(final String backupDBPath) {
        return new BackupableDBOptions(backupDBPath) //
            .setSync(true) //
            .setShareTableFiles(false); // don't share data between backups
    }

    @Override
    public void describe(final Printer out) {
        final Lock readLock = this.readWriteLock.readLock();
        readLock.lock();
        try {
            if (this.db != null) {
                out.println(this.db.getProperty("rocksdb.stats"));
            }
            out.println("");
            if (this.statistics != null) {
                out.println(this.statistics.getString());
            }
        } catch (final RocksDBException e) {
            out.println(e);
        } finally {
            readLock.unlock();
        }
    }
}