/* * Copyright (c) 2017 Baidu, Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elasticsearch.index.engine; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import org.apache.lucene.index.AbortingException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexCallBack; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LiveIndexWriterConfig; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.SegmentCommitInfo; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriter.IndexReaderWarmer; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.SearcherFactory; import org.apache.lucene.search.SearcherManager; import 
org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.admin.cluster.lease.GetOrChangePrimaryShardLeaseRequest; import org.elasticsearch.action.admin.cluster.lease.GetOrChangePrimaryShardLeaseResponse; import org.elasticsearch.action.admin.cluster.lease.TransportGetOrChangePrimaryShardLeaseAction; import org.elasticsearch.action.admin.indices.shard.IndexShardStatsRequest; import org.elasticsearch.action.admin.indices.shard.IndexShardStatsResponse; import org.elasticsearch.action.admin.indices.shard.TransportIndexShardStatsAction; import org.elasticsearch.action.admin.indices.stats.CommonStats; import org.elasticsearch.action.admin.indices.stats.ShardStats; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.routing.DjbHashFunction; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.lucene.LoggerInfoStream; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.index.ElasticsearchDirectoryReader; import org.elasticsearch.common.lucene.index.ElasticsearchLeafReader; import org.elasticsearch.common.lucene.uid.Versions; import org.elasticsearch.common.math.MathUtils; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.AbstractRunnable; import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException; import 
org.elasticsearch.common.util.concurrent.ReleasableLock;
import org.elasticsearch.index.deletionpolicy.SnapshotIndexCommit;
import org.elasticsearch.index.indexing.ShardIndexingService;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.merge.MergeStats;
import org.elasticsearch.index.merge.OnGoingMerge;
import org.elasticsearch.index.shard.DLStats;
import org.elasticsearch.index.shard.ElasticsearchMergePolicy;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.MergeSchedulerConfig;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.shard.TranslogRecoveryPerformer;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.index.translog.TranslogCorruptedException;
import org.elasticsearch.index.translog.dl.DistributedTranslog;
import org.elasticsearch.indices.IndicesWarmer;
import org.elasticsearch.threadpool.ThreadPool;
import org.apache.distributedlog.DLSN;
import org.apache.distributedlog.LogRecordWithDLSN;
import org.apache.distributedlog.util.FutureUtils;
import com.twitter.util.Duration;
import com.twitter.util.Future;
import com.twitter.util.FutureEventListener;

/**
 * An {@link Engine} whose translog is backed by Apache DistributedLog
 * ({@link DistributedTranslog}) rather than a purely local translog.
 * Positions in the shared log are tracked as a {@link DLSN} plus a
 * transaction id, and both are persisted into the Lucene commit user data
 * (Engine.COMMIT_DLSN / Engine.COMMIT_TXID) so that recovery can resume
 * replaying the log from the last committed position.
 */
public class DLBasedEngine extends Engine {

    /**
     * When we last pruned expired tombstones from versionMap.deletes:
     */
    private volatile long lastDeleteVersionPruneTimeMSec;

    private final ShardIndexingService indexingService;
    @Nullable
    private final IndicesWarmer warmer;
    // distributed translog; also readable locally, used by realtime GET (see get(Get))
    private DistributedTranslog dlTranslog;
    private final ElasticsearchConcurrentMergeScheduler mergeScheduler;
    private IndexWriter indexWriter;
    private SearcherFactory searcherFactory;
    private SearcherManager searcherManager;

    private final Lock flushLock = new ReentrantLock();
    private final ReentrantLock optimizeLock = new ReentrantLock();
    private final Lock resourceLock = new ReentrantLock();

    // live per-uid version values; delete entries become tombstones that are GC'ed later
    private final LiveVersionMap versionMap;
    // striped locks serializing index/delete per uid (see dirtyLock usage below)
    private final Object[] dirtyLocks;
    // set while a "version map too big" refresh is queued, so only one is pending at a time
    private final AtomicBoolean versionMapRefreshPending = new AtomicBoolean();

    private volatile SegmentInfos lastCommittedSegmentInfos;
    private final IndexThrottle throttle;
    private ThreadPool threadPool;
    private TransportGetOrChangePrimaryShardLeaseAction checkLeaseAction;
    private TransportIndexShardStatsAction indexShardStatsAction;
    private IndexShard indexShard;
    // true once this copy finished recovering as primary; write/read paths check it
    private boolean isPrimaryRecoverFinished;
    // whether cluster routing currently marks this copy as the primary
    private boolean isPrimaryInRouting;
    // newest DL position / txid this engine has observed
    private DLSN latestDlsn;
    private AtomicLong latestTxid;
    // DL position / txid recorded in the last Lucene commit
    private DLSN commitDlsn;
    private long commitTxid;
    private ReplayDLOperationTask recoverFromDLTask;
    // primary lease validity window in millis (index.primary_lease_expire_seconds, default 3s)
    private long leaseExpireTime;
    private long lastLeaseSendTime;
    private String localNodeId;
    private long dlWriteTimeoutMillis;
    private boolean isSnapshotRecoveryFinished;
    private CheckLeaseTask checkLeaseTask;

    /**
     * Creates the engine: opens the Lucene writer, restores the committed DL
     * position (dlsn/txid) from the commit user data, creates the distributed
     * translog, and (further below) opens the searcher manager and schedules
     * the lease-check and log-truncation background tasks.
     *
     * @param engineConfig                engine configuration (settings, translog config, thread pool, ...)
     * @param skipInitialTranslogRecovery when false this engine still has to replay ops
     *                                    (primary recovery path; see the flag logic below)
     * @param nodeId                      id of the local node, used for lease/log ownership
     * @param checkLeaseAction            transport action used by CheckLeaseTask
     * @param indexShardStatsAction       transport action for shard stats
     * @param indexShard                  the shard this engine belongs to
     */
    public DLBasedEngine(EngineConfig engineConfig, boolean skipInitialTranslogRecovery, String nodeId,
            TransportGetOrChangePrimaryShardLeaseAction checkLeaseAction,
            TransportIndexShardStatsAction indexShardStatsAction, IndexShard indexShard) {
        super(engineConfig);
        this.dlWriteTimeoutMillis = 6000;
        this.indexShard = indexShard;
        this.isSnapshotRecoveryFinished = false;
        this.logger.debug("begin to created new DLBasedEngine");
        this.isPrimaryRecoverFinished = false;
        this.isPrimaryInRouting = false;
        this.lastLeaseSendTime = 0;
        this.localNodeId = nodeId;
        this.recoverFromDLTask = new ReplayDLOperationTask();
        this.leaseExpireTime = engineConfig.getIndexSettings().getAsInt("index.primary_lease_expire_seconds", 3) * 1000;
        this.checkLeaseAction = checkLeaseAction;
        this.indexShardStatsAction = indexShardStatsAction;
        this.latestDlsn = DLSN.InitialDLSN;
        this.latestTxid = new AtomicLong(0);
        this.commitDlsn = DLSN.InitialDLSN;
        this.commitTxid = 0L;
        this.versionMap = new LiveVersionMap();
        this.threadPool = engineConfig.getThreadPool();
        this.store.incRef();
        IndexWriter writer = null;
        Translog translog = null;
        SearcherManager manager = null;
        EngineMergeScheduler scheduler = null;
        boolean success = false;
        try {
            this.lastDeleteVersionPruneTimeMSec = engineConfig.getThreadPool().estimatedTimeInMillis();
            this.indexingService = engineConfig.getIndexingService();
            this.warmer = engineConfig.getWarmer();
            this.mergeScheduler = scheduler = new EngineMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings(), engineConfig.getMergeSchedulerConfig());
            this.dirtyLocks = new Object[Runtime.getRuntime().availableProcessors() * 10]; // we multiply it to have enough...
            for (int i = 0; i < this.dirtyLocks.length; i++) {
                this.dirtyLocks[i] = new Object();
            }
            boolean isCreateIndex = engineConfig.isCreate();
            if (isCreateIndex) {
                // a freshly created shard has nothing to recover and is primary by construction
                isPrimaryInRouting = true;
                isSnapshotRecoveryFinished = true;
            }
            try {
                writer = createWriter(isCreateIndex);
                this.indexWriter = writer;
                // restore the DL position of the last commit; fall back to the initial
                // position when the commit carries no DL metadata (brand-new index)
                Map<String, String> commitData = this.indexWriter.getCommitData();
                this.commitDlsn = commitData.containsKey(Engine.COMMIT_DLSN) ? DLSN.deserialize(commitData.get(Engine.COMMIT_DLSN)) : DLSN.InitialDLSN;
                this.commitTxid = commitData.containsKey(Engine.COMMIT_TXID) ? Long.parseLong(commitData.get(Engine.COMMIT_TXID)) : 0;
                // create means
                // 1. for primary, it is force allocated to this node using allocate api or first created when create index
                // 2. for replica, it is always false
                // ignore translog status
                // 1. for primary, recovery from snapshot
                // 2. for replica it is always false
                if (isCreateIndex || engineConfig.getIngoreTranslogStatus()) {
                    // if this is a create shard request, then use commit to create segment files
                    // commit will also write latest dlsn and txid to commit file
                    // snapshot recovery will set ingore translog status to true, then also rewrite dlsn and txid data
                    commitIndexWriter();
                }
                // it means this is primary recovery
                if (!skipInitialTranslogRecovery || isCreateIndex || engineConfig.getIngoreTranslogStatus()) {
                    isSnapshotRecoveryFinished = true;
                }
                this.dlTranslog = new DistributedTranslog(engineConfig.getTranslogConfig(), localNodeId);
            } catch (IOException | TranslogCorruptedException e) {
                throw new EngineCreationFailureException(shardId, "failed to create engine", e);
            } catch (AssertionError e) {
                // IndexWriter throws AssertionError on init, if asserts are enabled, if any files don't exist, but tests that
                // randomly throw FNFE/NSFE can also hit this:
                if (ExceptionsHelper.stackTrace(e).contains("org.apache.lucene.index.IndexWriter.filesExist")) {
                    throw new EngineCreationFailureException(shardId, "failed to create engine", e);
                } else {
                    throw e;
                }
            }
            // re-read commit data: commitIndexWriter() above may have stamped fresh values
            Map<String, String> commitUserData = this.indexWriter.getCommitData();
            this.latestDlsn = commitUserData.containsKey(Engine.COMMIT_DLSN) ? DLSN.deserialize(commitUserData.get(Engine.COMMIT_DLSN)) : DLSN.InitialDLSN;
            this.latestTxid.set(commitUserData.containsKey(Engine.COMMIT_TXID) ?
Long.parseLong(commitUserData.get(Engine.COMMIT_TXID)) : 0); // open search manager after recovery this.throttle = new IndexThrottle(); this.searcherFactory = new SearchFactory(this.logger, this.isClosed, engineConfig); manager = createSearcherManager(); this.searcherManager = manager; this.versionMap.setManager(searcherManager); checkLeaseTask = new CheckLeaseTask(isCreateIndex); threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.GENERIC, checkLeaseTask); threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.GENERIC, new TruncateOldLog()); success = true; } finally { if (success == false) { IOUtils.closeWhileHandlingException(writer, translog, manager, scheduler); this.versionMap.clear(); if (this.isClosed.get() == false) { // failure we need to dec the store reference this.store.decRef(); } } else { logger.info("engine started"); } } } @Override public boolean readAllowed() { if (!isPrimaryInRouting) { throw new IllegalStateException("[" + localNodeId + "] " + shardId.toString() + "shard is not marked as primary shard in routing state"); } if (!isPrimaryRecoverFinished) { throw new IllegalStateException("[" + localNodeId + "] " + shardId.toString() + "shard have not recovered as primary"); } if (System.currentTimeMillis() - lastLeaseSendTime > leaseExpireTime) { throw new IllegalStateException("[" + localNodeId + "] " + shardId.toString() + "shard maybe not have primary lease"); } return true; } public void onTranslogRecoveryFinished() { logger.debug("begin to start recovery from translog, current dlsn is [{}]", latestDlsn); this.isSnapshotRecoveryFinished = true; } public long getLatestTxid() { return this.latestTxid.get(); } public DLSN getLatestDLSN() { return this.latestDlsn; } public long getCommitTxid() { return this.commitTxid; } public DLSN getCommitDLSN() { return this.commitDlsn; } public void setIsPrimaryInRouting(boolean isPrimary) { this.isPrimaryInRouting = isPrimary; } private void 
applyTranslogOperation(Translog.Operation operation) { indexShard.markLastWrite(); final TranslogRecoveryPerformer handler = engineConfig.getTranslogRecoveryPerformer(); handler.performRecoveryOperation(this, operation, true); } private SearcherManager createSearcherManager() throws EngineException { boolean success = false; SearcherManager searcherManager = null; try { try { final DirectoryReader directoryReader = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(indexWriter, true), shardId); searcherManager = new SearcherManager(directoryReader, searcherFactory); lastCommittedSegmentInfos = readLastCommittedSegmentInfos(searcherManager, store); success = true; return searcherManager; } catch (IOException e) { maybeFailEngine("start", e); try { indexWriter.rollback(); } catch (IOException e1) { // iw is closed below e.addSuppressed(e1); } throw new EngineCreationFailureException(shardId, "failed to open reader on writer", e); } } finally { if (success == false) { // release everything we created on a failure IOUtils.closeWhileHandlingException(searcherManager, indexWriter); } } } private void updateIndexWriterSettings() { try { final LiveIndexWriterConfig iwc = indexWriter.getConfig(); iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac()); iwc.setUseCompoundFile(engineConfig.isCompoundOnFlush()); } catch (AlreadyClosedException ex) { // ignore } } @Override public GetResult get(Get get) throws EngineException { try (ReleasableLock lock = readLock.acquire()) { ensureOpen(); if (get.realtime()) { VersionValue versionValue = versionMap.getUnderLock(get.uid().bytes()); if (versionValue != null) { if (versionValue.delete()) { return GetResult.NOT_EXISTS; } if (get.versionType().isVersionConflictForReads(versionValue.version(), get.version())) { Uid uid = Uid.createUid(get.uid().text()); throw new VersionConflictEngineException(shardId, uid.type(), uid.id(), versionValue.version(), get.version()); } if (dlTranslog != null) { Translog.Operation op = 
dlTranslog.readFromLocal(versionValue.translogLocation()); if (op != null) { return new GetResult(true, versionValue.version(), op.getSource()); } } } } return getFromSearcher(get); } } private void checkEngineStatus(Operation operation) { ensureOpen(); if (operation.origin() == Operation.Origin.PRIMARY) { if(!isPrimaryRecoverFinished) { throw new EngineException(shardId, "maybe the primary engine is in recovery"); } } else if (operation.origin() != Operation.Origin.RECOVERY) { throw new EngineException(shardId, String.format("only primary origin and recovery origin is supported, current [%s]", operation.origin())); } } @Override public void create(Create create) throws EngineException { try (ReleasableLock lock = readLock.acquire()) { checkEngineStatus(create); if (create.origin() == Operation.Origin.RECOVERY) { // Don't throttle recovery operations innerCreate(create); } else { readAllowed(); try (Releasable r = throttle.acquireThrottle()) { innerCreate(create); } } } catch (OutOfMemoryError | IllegalStateException | IOException t) { maybeFailEngine("create", t); throw new CreateFailedEngineException(shardId, create.type(), create.id(), t); } checkVersionMapRefresh(); } private void innerCreate(Create create) throws IOException { if (engineConfig.isOptimizeAutoGenerateId() && create.autoGeneratedId() && !create.canHaveDuplicates()) { innerCreateNoLock(create, Versions.NOT_FOUND, null); } else { synchronized (dirtyLock(create.uid())) { final long currentVersion; final VersionValue versionValue; versionValue = versionMap.getUnderLock(create.uid().bytes()); if (versionValue == null) { currentVersion = loadCurrentVersionFromIndex(create.uid()); } else { if (versionValue.delete() && ( create.origin() == Operation.Origin.RECOVERY || engineConfig.isEnableGcDeletes() && (engineConfig.getThreadPool().estimatedTimeInMillis() - versionValue.time()) > engineConfig.getGcDeletesInMillis())) { currentVersion = Versions.NOT_FOUND; // deleted, and GC } else { currentVersion = 
versionValue.version();
                    }
                }
                innerCreateNoLock(create, currentVersion, versionValue);
            }
        }
    }

    /**
     * Applies a create while the caller holds the uid's dirty lock (or for the
     * optimized auto-generated-id path, with no lock and NOT_FOUND as current version).
     *
     * @param create         the create operation
     * @param currentVersion version currently live for this uid, or Versions.NOT_FOUND
     * @param versionValue   the version-map entry for the uid, or null when the
     *                       version came from the index / the optimized path
     */
    private void innerCreateNoLock(Create create, long currentVersion, VersionValue versionValue) throws IOException {
        long updatedVersion;
        long expectedVersion = create.version();
        if (create.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
            // recovery means reindex or replay from dl
            // replay from dl: it never happens, because primary will check it
            // reindex: means user insert a new record, but reindex never go here
            if (create.origin() == Operation.Origin.RECOVERY) {
                logger.info("create origin recovery but meet create conflicts, should not happern {} {} {}", create.versionType(), currentVersion, expectedVersion);
                return;
            } else {
                throw new VersionConflictEngineException(shardId, create.type(), create.id(), currentVersion, expectedVersion);
            }
        }
        updatedVersion = create.versionType().updateVersion(currentVersion, expectedVersion);
        boolean doUpdate = false;
        if (create.origin() == Operation.Origin.RECOVERY) {
            // replay trusts the recorded version; if any doc (live or tombstoned)
            // already exists the create must be applied as an update
            updatedVersion = create.version();
            if (versionValue != null || currentVersion != Versions.NOT_FOUND) {
                doUpdate = true;
            }
        } else if (create.origin() == Operation.Origin.PRIMARY) {
            if ((versionValue != null && versionValue.delete() == false) || (versionValue == null && currentVersion != Versions.NOT_FOUND)) {
                if (create.autoGeneratedId() && create.canHaveDuplicates() && currentVersion == 1 && create.version() == Versions.MATCH_ANY) {
                    /**
                     * If bulk index request fails due to a disconnect, unavailable shard etc. then the request is
                     * retried before it actually fails. However, the documents might already be indexed.
                     * For autogenerated ids this means that a version conflict will be reported in the bulk request
                     * although the document was indexed properly.
                     * To avoid this we have to make sure that the index request is treated as an update and set updatedVersion to 1.
                     * See also discussion on https://github.com/elasticsearch/elasticsearch/pull/9125
                     */
                    doUpdate = true;
                    updatedVersion = 1;
                } else {
                    throw new DocumentAlreadyExistsException(shardId, create.type(), create.id());
                }
            }
        }
        create.updateVersion(updatedVersion);
        // If it is a write request from primary, then should write the log first
        WriteDistributedLogCallBack callBack = null;
        if (create.origin() == Operation.Origin.PRIMARY) {
            callBack = new WriteDistributedLogCallBack(new Translog.Create(create));
        }
        if (doUpdate) {
            if (create.docs().size() > 1) {
                indexWriter.updateDocuments(create.uid(), create.docs(), callBack);
            } else {
                indexWriter.updateDocument(create.uid(), create.docs().get(0), callBack);
            }
        } else {
            if (create.docs().size() > 1) {
                indexWriter.addDocuments(create.docs(), callBack);
            } else {
                indexWriter.addDocument(create.docs().get(0), callBack);
            }
        }
        if (callBack != null) {
            // callBack.location is read after the writer call -- presumably the DL write
            // is hooked into the Lucene write via the IndexCallBack; TODO confirm
            versionMap.putUnderLock(create.uid().bytes(), new VersionValue(updatedVersion, callBack.location));
            create.setTranslogLocation(callBack.location);
        } else {
            versionMap.putUnderLock(create.uid().bytes(), new VersionValue(updatedVersion, null));
        }
        indexingService.postCreateUnderLock(create);
    }

    /**
     * Indexes (add-or-update) a document; returns whether the doc was newly created.
     * Recovery operations bypass throttling and the read-allowed gate.
     */
    @Override
    public boolean index(Index index) throws EngineException {
        final boolean created;
        try (ReleasableLock lock = readLock.acquire()) {
            checkEngineStatus(index);
            if (index.origin() == Operation.Origin.RECOVERY) {
                // Don't throttle recovery operations
                created = innerIndex(index);
            } else {
                readAllowed();
                try (Releasable r = throttle.acquireThrottle()) {
                    created = innerIndex(index);
                }
            }
        } catch (OutOfMemoryError | IllegalStateException | IOException t) {
            maybeFailEngine("index", t);
            throw new IndexFailedEngineException(shardId, index.type(), index.id(), t);
        }
        checkVersionMapRefresh();
        return created;
    }

    /**
     * Forces a refresh if the versionMap is using too much RAM
     */
    private void checkVersionMapRefresh() {
        if (versionMap.ramBytesUsedForRefresh() > config().getVersionMapSize().bytes() &&
versionMapRefreshPending.getAndSet(true) == false) {
            // getAndSet ensures only one refresh is queued at a time
            try {
                if (isClosed.get()) {
                    // no point...
                    return;
                }
                // Now refresh to clear versionMap:
                engineConfig.getThreadPool().executor(ThreadPool.Names.REFRESH).execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            refresh("version_table_full");
                        } catch (EngineClosedException ex) {
                            // ignore
                        }
                    }
                });
            } catch (EsRejectedExecutionException ex) {
                // that is fine too.. we might be shutting down
            }
        }
    }

    /**
     * Indexes under the uid's dirty lock; returns true when the doc did not exist
     * before. Primary non-reindex ops validate the version and carry a callback
     * that records the distributed-log location; reindex ops require the expected
     * version to match the current one exactly.
     */
    private boolean innerIndex(Index index) throws IOException {
        synchronized (dirtyLock(index.uid())) {
            final long currentVersion;
            VersionValue versionValue = versionMap.getUnderLock(index.uid().bytes());
            if (versionValue == null) {
                currentVersion = loadCurrentVersionFromIndex(index.uid());
            } else {
                if (versionValue.delete() && ( index.origin() == Operation.Origin.RECOVERY || engineConfig.isEnableGcDeletes() && (engineConfig.getThreadPool().estimatedTimeInMillis() - versionValue.time()) > engineConfig.getGcDeletesInMillis())) {
                    currentVersion = Versions.NOT_FOUND; // deleted, and GC
                } else {
                    currentVersion = versionValue.version();
                }
            }
            long expectedVersion = index.version();
            long updatedVersion;
            // If it is a write request from primary, then should write the log first
            WriteDistributedLogCallBack callBack = null;
            if (index.origin() == Operation.Origin.PRIMARY && !index.reindex()) {
                if (index.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
                    throw new VersionConflictEngineException(shardId, index.type(), index.id(), currentVersion, expectedVersion);
                }
                updatedVersion = index.versionType().updateVersion(currentVersion, expectedVersion);
                index.updateVersion(updatedVersion);
                callBack = new WriteDistributedLogCallBack(new Translog.Index(index));
            } else {
                // for reindex request, the expected version has to equal to current version
                if (index.reindex()) {
                    if (currentVersion != expectedVersion) {
                        VersionConflictEngineException exception = new VersionConflictEngineException(shardId, index.type(), index.id(), currentVersion, expectedVersion);
                        logger.info("reindex meet version conflict, maybe user deleted or updated the doc", exception);
                        throw exception;
                    }
                }
                updatedVersion = index.version();
                index.updateVersion(updatedVersion); // has to update version here
            }
            final boolean created;
            if (currentVersion == Versions.NOT_FOUND) {
                created = true;
                if (index.docs().size() > 1) {
                    indexWriter.addDocuments(index.docs(), callBack);
                } else {
                    indexWriter.addDocument(index.docs().get(0), callBack);
                }
            } else {
                if (versionValue != null) {
                    created = versionValue.delete(); // we have a delete which is not GC'ed...
                } else {
                    created = false;
                }
                if (index.docs().size() > 1) {
                    indexWriter.updateDocuments(index.uid(), index.docs(), callBack);
                } else {
                    indexWriter.updateDocument(index.uid(), index.docs().get(0), callBack);
                }
            }
            if (callBack != null) {
                versionMap.putUnderLock(index.uid().bytes(), new VersionValue(updatedVersion, callBack.location));
                index.setTranslogLocation(callBack.location);
            } else {
                versionMap.putUnderLock(index.uid().bytes(), new VersionValue(updatedVersion, null));
            }
            indexingService.postIndexUnderLock(index);
            return created;
        }
    }

    /**
     * Deletes a document under the uid's dirty lock. The version map receives a
     * DeleteVersionValue tombstone stamped with the current time so expired
     * tombstones can be pruned later.
     */
    @Override
    public void delete(Delete delete) throws EngineException {
        try (ReleasableLock lock = readLock.acquire()) {
            checkEngineStatus(delete);
            synchronized (dirtyLock(delete.uid())) {
                final long currentVersion;
                VersionValue versionValue = versionMap.getUnderLock(delete.uid().bytes());
                if (versionValue == null) {
                    currentVersion = loadCurrentVersionFromIndex(delete.uid());
                } else {
                    if (versionValue.delete() && ( delete.origin() == Operation.Origin.RECOVERY || engineConfig.isEnableGcDeletes() && (engineConfig.getThreadPool().estimatedTimeInMillis() - versionValue.time()) > engineConfig.getGcDeletesInMillis())) {
                        currentVersion = Versions.NOT_FOUND; // deleted, and GC
                    } else {
                        currentVersion = versionValue.version();
                    }
                }
                WriteDistributedLogCallBack callBack = null;
                long updatedVersion;
                long expectedVersion = delete.version();
                if (delete.origin() ==
Operation.Origin.PRIMARY) {
                    readAllowed();
                    if (delete.versionType().isVersionConflictForWrites(currentVersion, expectedVersion)) {
                        throw new VersionConflictEngineException(shardId, delete.type(), delete.id(), currentVersion, expectedVersion);
                    }
                    updatedVersion = delete.versionType().updateVersion(currentVersion, expectedVersion);
                } else {
                    updatedVersion = delete.version();
                }
                final boolean found;
                if (currentVersion == Versions.NOT_FOUND) {
                    // doc does not exist and no prior deletes
                    found = false;
                } else if (versionValue != null && versionValue.delete()) {
                    // a "delete on delete", in this case, we still increment the version, log it, and return that version
                    found = false;
                } else {
                    // we deleted a currently existing document
                    if (delete.origin() == Operation.Origin.PRIMARY) {
                        // only primary writes go through the distributed log
                        callBack = new WriteDistributedLogCallBack(new Translog.Delete(delete));
                    }
                    indexWriter.deleteDocuments(callBack, delete.uid());
                    found = true;
                }
                delete.updateVersion(updatedVersion, found);
                if (callBack != null) {
                    versionMap.putUnderLock(delete.uid().bytes(), new DeleteVersionValue(updatedVersion, engineConfig.getThreadPool().estimatedTimeInMillis(), callBack.location));
                    delete.setTranslogLocation(callBack.location);
                } else {
                    versionMap.putUnderLock(delete.uid().bytes(), new DeleteVersionValue(updatedVersion, engineConfig.getThreadPool().estimatedTimeInMillis(), null));
                }
                indexingService.postDeleteUnderLock(delete);
            }
        } catch (OutOfMemoryError | IllegalStateException | IOException t) {
            maybeFailEngine("delete", t);
            throw new DeleteFailedEngineException(shardId, delete, t);
        }
        maybePruneDeletedTombstones();
        checkVersionMapRefresh();
    }

    /** Prunes expired delete tombstones at most every 1/4 of the GC-deletes interval. */
    private void maybePruneDeletedTombstones() {
        // It's expensive to prune because we walk the deletes map acquiring dirtyLock for each uid so we only do it
        // every 1/4 of gcDeletesInMillis:
        if (engineConfig.isEnableGcDeletes() && engineConfig.getThreadPool().estimatedTimeInMillis() - lastDeleteVersionPruneTimeMSec > engineConfig.getGcDeletesInMillis() * 0.25) {
            pruneDeletedTombstones();
        }
    }

    /** Delete-by-query is not supported by this engine. */
    @Override
    public void delete(DeleteByQuery delete) throws EngineException {
        throw new EngineException(shardId, "delete by query is not supported");
    }

    /**
     * Refreshes the searcher so recent writes become visible, then removes local
     * log segments older than the generation that the refreshed searcher covers
     * (they are no longer needed for realtime GET).
     */
    @Override
    public void refresh(String source) throws EngineException {
        // we obtain a read lock here, since we don't want a flush to happen while we are refreshing
        // since it flushes the index as well (though, in terms of concurrency, we are allowed to do it)
        logger.info("begin to refresh due to [{}]", source);
        try (ReleasableLock lock = readLock.acquire()) {
            ensureOpen();
            if (dlTranslog != null) {
                // capture the generation before refreshing: everything before it is searchable afterwards
                long currentGen = dlTranslog.getLocalTranslogGeneration();
                searcherManager.maybeRefreshBlocking();
                dlTranslog.deleteLogBeforeGeneration(currentGen);
            } else {
                searcherManager.maybeRefreshBlocking();
            }
        } catch (AlreadyClosedException e) {
            ensureOpen();
            maybeFailEngine("refresh", e);
            logger.error("errors while refresh index", e);
        } catch (EngineClosedException e) {
            throw e;
        } catch (Throwable t) {
            failEngine("refresh failed", t);
            throw new RefreshFailedEngineException(shardId, t);
        }
        // TODO: maybe we should just put a scheduled job in threadPool?
        // We check for pruning in each delete request, but we also prune here e.g.
// in case a delete burst comes in and then no more deletes
        // for a long time:
        maybePruneDeletedTombstones();
        versionMapRefreshPending.set(false);
        mergeScheduler.refreshConfig();
    }

    /** Synced flush is unnecessary with the distributed log; always reports success. */
    @Override
    public SyncedFlushResult syncFlush(String syncId, CommitId expectedCommitId) throws EngineException {
        // syncFlush is used to optimise recovery process, it is useless when using dlsn
        // see https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-synced-flush.html
        return SyncedFlushResult.SUCCESS;
    }

    /**
     * Commits the Lucene index, stamping the latest DL position and txid into the
     * commit user data so recovery can resume from there. Fails the engine when
     * the commit itself throws.
     */
    private void commitIndexWriter() throws IOException {
        try {
            Map<String, String> commitData = new HashMap<>(2);
            // when commit, lucene will call flushAllThreads first, so dlsn and txid may be a little smaller than
            // data stored
            commitData.put(Engine.COMMIT_DLSN, latestDlsn.serialize());
            commitData.put(Engine.COMMIT_TXID, Long.toString(latestTxid.get()));
            indexWriter.setCommitData(commitData);
            indexWriter.commit();
            logger.trace("finished commit for flush");
        } catch (Throwable ex) {
            failEngine("lucene commit failed", ex);
            throw ex;
        }
    }

    /** Flushes without forcing a commit and without waiting on an ongoing flush. */
    @Override
    public CommitId flush() throws EngineException {
        return flush(false, false);
    }

    /**
     * Flushes: commits the Lucene index together with the DL position, refreshes to
     * clear old version-map entries, and commits/prunes the distributed translog.
     *
     * @param force         commit even when the writer reports no uncommitted changes
     * @param waitIfOngoing block on a concurrent flush instead of throwing
     * @return the id of the last commit point
     */
    @Override
    public CommitId flush(boolean force, boolean waitIfOngoing) throws EngineException {
        ensureOpen();
        final byte[] newCommitId;
        /*
         * Unfortunately the lock order is important here. We have to acquire the readlock first otherwise
         * if we are flushing at the end of the recovery while holding the write lock we can deadlock if:
         * Thread 1: flushes via API and gets the flush lock but blocks on the readlock since Thread 2 has the writeLock
         * Thread 2: flushes at the end of the recovery holding the writeLock and blocks on the flushLock owned by Thread 1
         */
        try (ReleasableLock lock = readLock.acquire()) {
            ensureOpen();
            if (flushLock.tryLock() == false) {
                // if we can't get the lock right away we block if needed otherwise barf
                if (waitIfOngoing) {
                    logger.debug("waiting for in-flight flush to finish");
                    flushLock.lock();
                    logger.debug("acquired flush lock after blocking");
                } else {
                    throw new FlushNotAllowedEngineException(shardId, "already flushing...");
                }
            } else {
                logger.debug("acquired flush lock immediately");
            }
            try {
                if (indexWriter.hasUncommittedChanges() || force) {
                    try {
                        logger.debug("prepare commit");
                        dlTranslog.prepareCommit();
                        logger.debug("commit writer");
                        commitIndexWriter();
                        // we need to refresh in order to clear older version values
                        refresh("version_table_flush");
                        logger.debug("commit finished");
                        dlTranslog.commit();
                    } catch (Throwable e) {
                        throw new FlushFailedEngineException(shardId, e);
                    }
                }
                /*
                 * we have to inc-ref the store here since if the engine is closed by a tragic event
                 * we don't acquire the write lock and wait until we have exclusive access. This might also
                 * dec the store reference which can essentially close the store and unless we can inc the reference
                 * we can't use it.
                 */
                store.incRef();
                try {
                    // reread the last committed segment infos
                    lastCommittedSegmentInfos = store.readLastCommittedSegmentsInfo();
                    // mirror the freshly committed DL position into the in-memory fields
                    Map<String, String> commitData = indexWriter.getCommitData();
                    this.commitDlsn = commitData.containsKey(Engine.COMMIT_DLSN) ? DLSN.deserialize(commitData.get(Engine.COMMIT_DLSN)) : DLSN.InitialDLSN;
                    this.commitTxid = commitData.containsKey(Engine.COMMIT_TXID) ? Long.parseLong(commitData.get(Engine.COMMIT_TXID)) : 0;
                } catch (Throwable e) {
                    logger.warn("errors while commit data", e);
                    if (isClosed.get() == false) {
                        logger.warn("failed to read latest segment infos on flush", e);
                        if (Lucene.isCorruptionException(e)) {
                            throw new FlushFailedEngineException(shardId, e);
                        }
                    }
                } finally {
                    store.decRef();
                }
                newCommitId = lastCommittedSegmentInfos.getId();
                if (newCommitId == null) {
                    logger.warn("commitid is null");
                }
            } catch (FlushFailedEngineException ex) {
                maybeFailEngine("flush", ex);
                throw ex;
            } finally {
                flushLock.unlock();
            }
        }
        // We don't have to do this here; we do it defensively to make sure that even if wall clock time is misbehaving
        // (e.g., moves backwards) we will at least still sometimes prune deleted tombstones:
        if (engineConfig.isEnableGcDeletes()) {
            pruneDeletedTombstones();
        }
        if (newCommitId == null) {
            logger.warn("commitid is null");
        }
        return new CommitId(newCommitId);
    }

    /** Removes tombstones older than the GC-deletes interval from the version map. */
    private void pruneDeletedTombstones() {
        long timeMSec = engineConfig.getThreadPool().estimatedTimeInMillis();
        // TODO: not good that we reach into LiveVersionMap here; can we move this inside VersionMap instead? problem is the dirtyLock...
        // we only need to prune the deletes map; the current/old version maps are cleared on refresh:
        for (Map.Entry<BytesRef, VersionValue> entry : versionMap.getAllTombstones()) {
            BytesRef uid = entry.getKey();
            synchronized (dirtyLock(uid)) { // can we do it without this lock on each value? maybe batch to a set and get the lock once per set?
// Must re-get it here, vs using entry.getValue(), in case the uid was indexed/deleted since we pulled the iterator: VersionValue versionValue = versionMap.getTombstoneUnderLock(uid); if (versionValue != null) { if (timeMSec - versionValue.time() > engineConfig.getGcDeletesInMillis()) { versionMap.removeTombstoneUnderLock(uid); } } } } lastDeleteVersionPruneTimeMSec = timeMSec; } @Override public void forceMerge(final boolean flush, int maxNumSegments, boolean onlyExpungeDeletes, final boolean upgrade, final boolean upgradeOnlyAncientSegments) throws EngineException, EngineClosedException, IOException { /* * We do NOT acquire the readlock here since we are waiting on the merges to finish * that's fine since the IW.rollback should stop all the threads and trigger an IOException * causing us to fail the forceMerge * * The way we implement upgrades is a bit hackish in the sense that we set an instance * variable and that this setting will thus apply to the next forced merge that will be run. * This is ok because (1) this is the only place we call forceMerge, (2) we have a single * thread for optimize, and the 'optimizeLock' guarding this code, and (3) ConcurrentMergeScheduler * syncs calls to findForcedMerges. 
*/ assert indexWriter.getConfig().getMergePolicy() instanceof ElasticsearchMergePolicy : "MergePolicy is " + indexWriter.getConfig().getMergePolicy().getClass().getName(); ElasticsearchMergePolicy mp = (ElasticsearchMergePolicy) indexWriter.getConfig().getMergePolicy(); optimizeLock.lock(); try { ensureOpen(); if (upgrade) { logger.info("starting segment upgrade upgradeOnlyAncientSegments={}", upgradeOnlyAncientSegments); mp.setUpgradeInProgress(true, upgradeOnlyAncientSegments); } store.incRef(); // increment the ref just to ensure nobody closes the store while we optimize try { if (onlyExpungeDeletes) { assert upgrade == false; indexWriter.forceMergeDeletes(true /* blocks and waits for merges*/); } else if (maxNumSegments <= 0) { assert upgrade == false; indexWriter.maybeMerge(); } else { indexWriter.forceMerge(maxNumSegments, true /* blocks and waits for merges*/); } if (flush) { flush(true, true); } if (upgrade) { logger.info("finished segment upgrade"); } } finally { store.decRef(); } } catch (Throwable t) { maybeFailEngine("force merge", t); throw t; } finally { try { mp.setUpgradeInProgress(false, false); // reset it just to make sure we reset it in a case of an error } finally { optimizeLock.unlock(); } } } @Override public SnapshotIndexCommit snapshotIndex(final boolean flushFirst) throws EngineException { // we have to flush outside of the readlock otherwise we might have a problem upgrading // the to a write lock when we fail the engine in this operation if (flushFirst) { logger.trace("start flush for snapshot"); flush(false, true); logger.trace("finish flush for snapshot"); } try (ReleasableLock lock = readLock.acquire()) { ensureOpen(); logger.trace("pulling snapshot"); return deletionPolicy.snapshot(); } catch (IOException e) { throw new SnapshotFailedEngineException(shardId, e); } } @Override protected boolean maybeFailEngine(String source, Throwable t) { boolean shouldFail = super.maybeFailEngine(source, t); if (shouldFail) { return true; } return 
false;
}

/** Returns the segment infos of the last successful commit (re-read after each flush). */
@Override
protected SegmentInfos getLastCommittedSegmentInfos() {
    return lastCommittedSegmentInfos;
}

/** Accounts version-map and IndexWriter heap usage into the segment stats. */
@Override
protected final void writerSegmentStats(SegmentsStats stats) {
    stats.addVersionMapMemoryInBytes(versionMap.ramBytesUsed());
    stats.addIndexWriterMemoryInBytes(indexWriter.ramBytesUsed());
    // max RAM buffer is configured in MB; convert to bytes for the stats API
    stats.addIndexWriterMaxMemoryInBytes((long) (indexWriter.getConfig().getRAMBufferSizeMB() * 1024 * 1024));
}

@Override
public long indexWriterRAMBytesUsed() {
    return indexWriter.ramBytesUsed();
}

/**
 * Lists the segments of the last commit, tagging each segment that currently
 * participates in an ongoing merge with that merge's id.
 */
@Override
public List<Segment> segments(boolean verbose) {
    try (ReleasableLock lock = readLock.acquire()) {
        Segment[] segmentsArr = getSegmentInfo(lastCommittedSegmentInfos, verbose);
        // fill in the merges flag
        Set<OnGoingMerge> onGoingMerges = mergeScheduler.onGoingMerges();
        for (OnGoingMerge onGoingMerge : onGoingMerges) {
            for (SegmentCommitInfo segmentInfoPerCommit : onGoingMerge.getMergedSegments()) {
                for (Segment segment : segmentsArr) {
                    if (segment.getName().equals(segmentInfoPerCommit.info.name)) {
                        segment.mergeId = onGoingMerge.getId();
                        break;
                    }
                }
            }
        }
        return Arrays.asList(segmentsArr);
    }
}

/**
 * Closes the engine without acquiring the write lock. This should only be
 * called while the write lock is hold or in a disaster condition ie. if the engine
 * is failed.
 */
@Override
protected final void closeNoLock(String reason) {
    resourceLock.lock();
    try {
        if (isClosed.compareAndSet(false, true)) {
            assert rwl.isWriteLockedByCurrentThread() || failEngineLock.isHeldByCurrentThread() :
                    "Either the write lock must be held or the engine must be currently be failing itself";
            try {
                this.versionMap.clear();
                try {
                    IOUtils.close(searcherManager);
                } catch (Throwable t) {
                    logger.warn("Failed to close SearcherManager", t);
                }
                // if write lock is held, then we should not stop the recover thread here
                // it will be a dead lock
                if (!rwl.isWriteLockedByCurrentThread()) {
                    recoverFromDLTask.stop(true);
                }
                try {
                    IOUtils.close(dlTranslog);
                } catch (Throwable t) {
                    logger.warn("Failed to close translog", t);
                }
                // no need to commit in this case!, we snapshot before we close the shard, so translog and all sync'ed
                logger.trace("rollback indexWriter");
                try {
                    indexWriter.rollback();
                } catch (AlreadyClosedException e) {
                    // ignore
                }
                logger.trace("rollback indexWriter done");
            } catch (Throwable e) {
                logger.warn("failed to rollback writer on close", e);
            } finally {
                store.decRef();
                logger.debug("engine closed [{}]", reason);
            }
        }
    } finally {
        resourceLock.unlock();
    }
}

// Stops the replay task after close; safe here because no write lock is held anymore
// (closeNoLock skips the stop when the write lock is held to avoid a deadlock).
protected void afterClose(String reason) {
    recoverFromDLTask.stop(true);
}

@Override
public boolean hasUncommittedChanges() {
    return indexWriter.hasUncommittedChanges();
}

@Override
protected SearcherManager getSearcherManager() {
    return searcherManager;
}

/** Maps a uid to one of a fixed pool of lock objects via its DJB hash (striped locking). */
private Object dirtyLock(BytesRef uid) {
    int hash = DjbHashFunction.DJB_HASH(uid.bytes, uid.offset, uid.length);
    return dirtyLocks[MathUtils.mod(hash, dirtyLocks.length)];
}

private Object dirtyLock(Term uid) {
    return dirtyLock(uid.bytes());
}

/** Reads the current version for {@code uid} straight from the index (bypassing the version map). */
private long loadCurrentVersionFromIndex(Term uid) throws IOException {
    try (final Searcher searcher = acquireSearcher("load_version", false)) {
        return Versions.loadVersion(searcher.reader(), uid);
    }
}

/**
 * Creates the Lucene IndexWriter for this engine.
 *
 * @param create true to create a fresh index, false to append to an existing one
 */
private IndexWriter createWriter(boolean create) throws IOException {
    try {
        final IndexWriterConfig iwc = new
IndexWriterConfig(engineConfig.getAnalyzer());
        iwc.setCommitOnClose(false); // we by default don't commit on close
        iwc.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND);
        iwc.setIndexDeletionPolicy(deletionPolicy);
        // with tests.verbose, lucene sets this up: plumb to align with filesystem stream
        boolean verbose = false;
        try {
            verbose = Boolean.parseBoolean(System.getProperty("tests.verbose"));
        } catch (Throwable ignore) {
        }
        iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger));
        iwc.setMergeScheduler(mergeScheduler);
        MergePolicy mergePolicy = config().getMergePolicy();
        // Give us the opportunity to upgrade old segments while performing
        // background merges
        mergePolicy = new ElasticsearchMergePolicy(mergePolicy);
        iwc.setMergePolicy(mergePolicy);
        iwc.setSimilarity(engineConfig.getSimilarity());
        iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac());
        iwc.setCodec(engineConfig.getCodec());
        /* We set this timeout to a highish value to work around
         * the default poll interval in the Lucene lock that is
         * 1000ms by default. We might need to poll multiple times
         * here but with 1s poll this is only executed twice at most
         * in combination with the default writelock timeout*/
        iwc.setWriteLockTimeout(5000);
        iwc.setUseCompoundFile(this.engineConfig.isCompoundOnFlush());
        // Warm-up hook for newly-merged segments. Warming up segments here is better since it will be performed at the end
        // of the merge operation and won't slow down _refresh
        iwc.setMergedSegmentWarmer(new IndexReaderWarmer() {
            @Override
            public void warm(LeafReader reader) throws IOException {
                try {
                    LeafReader esLeafReader = new ElasticsearchLeafReader(reader, shardId);
                    assert isMergedSegment(esLeafReader);
                    if (warmer != null) {
                        final Engine.Searcher searcher =
                                new Searcher("warmer", searcherFactory.newSearcher(esLeafReader, null));
                        final IndicesWarmer.WarmerContext context =
                                new IndicesWarmer.WarmerContext(shardId, searcher);
                        warmer.warmNewReaders(context);
                    }
                } catch (Throwable t) {
                    // Don't fail a merge if the warm-up failed
                    if (isClosed.get() == false) {
                        logger.warn("Warm-up failed", t);
                    }
                    if (t instanceof Error) {
                        // assertion/out-of-memory error, don't ignore those
                        throw (Error) t;
                    }
                }
            }
        });
        return new IndexWriter(store.directory(), iwc);
    } catch (LockObtainFailedException ex) {
        boolean isLocked = IndexWriter.isLocked(store.directory());
        logger.warn("Could not lock IndexWriter isLocked [{}]", ex, isLocked);
        throw ex;
    }
}

/** Enables index-throttling (called when too many merges are in flight). */
public void activateThrottling() {
    throttle.activate();
}

/** Disables index-throttling once merge pressure drops. */
public void deactivateThrottling() {
    throttle.deactivate();
}

long getGcDeletesInMillis() {
    return engineConfig.getGcDeletesInMillis();
}

LiveIndexWriterConfig getCurrentIndexWriterConfig() {
    return indexWriter.getConfig();
}

/** Re-applies dynamically-updatable settings to the merge scheduler, writer, and GC logic. */
public void onSettingsChanged() {
    mergeScheduler.refreshConfig();
    updateIndexWriterSettings();
    // config().getVersionMapSize() may have changed:
    checkVersionMapRefresh();
    // config().isEnableGcDeletes() or config.getGcDeletesInMillis() may have changed:
    maybePruneDeletedTombstones();
}

public MergeStats getMergeStats() {
    return mergeScheduler.stats();
}

@Override
public Translog getTranslog() {
    return dlTranslog;
}

/** Extended SearcherFactory that warms the segments if needed when acquiring a new searcher */
final static class SearchFactory extends EngineSearcherFactory {
    private final IndicesWarmer warmer;
    private final ShardId shardId;
    private final ESLogger logger;
    // shared flag from the owning engine; used to suppress warm-up warnings after close
    private final AtomicBoolean isEngineClosed;

    SearchFactory(ESLogger logger, AtomicBoolean isEngineClosed, EngineConfig engineConfig) {
        super(engineConfig);
        warmer = engineConfig.getWarmer();
        shardId = engineConfig.getShardId();
        this.logger = logger;
        this.isEngineClosed = isEngineClosed;
    }

    /**
     * Builds a searcher for {@code reader}, warming only the leaves that are new
     * relative to {@code previousReader}. Warm-up failures are logged, never propagated.
     */
    @Override
    public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) throws IOException {
        IndexSearcher searcher = super.newSearcher(reader, previousReader);
        if (reader instanceof LeafReader && isMergedSegment((LeafReader) reader)) {
            // we call newSearcher from the IndexReaderWarmer which warms segments during merging
            // in that case the reader is a LeafReader and all we need to do is to build a new Searcher
            // and return it since it does it's own warming for that particular reader.
            return searcher;
        }
        if (warmer != null) {
            // we need to pass a custom searcher that does not release anything on Engine.Search Release,
            // we will release explicitly
            IndexSearcher newSearcher = null;
            boolean closeNewSearcher = false;
            try {
                if (previousReader == null) {
                    // we are starting up - no writer active so we can't acquire a searcher.
                    newSearcher = searcher;
                } else {
                    // figure out the newSearcher, with only the new readers that are relevant for us
                    List<IndexReader> readers = new ArrayList<>();
                    for (LeafReaderContext newReaderContext : reader.leaves()) {
                        if (isMergedSegment(newReaderContext.reader())) {
                            // merged segments are already handled by IndexWriterConfig.setMergedSegmentWarmer
                            continue;
                        }
                        boolean found = false;
                        for (LeafReaderContext currentReaderContext : previousReader.leaves()) {
                            if (currentReaderContext.reader().getCoreCacheKey().equals(newReaderContext.reader().getCoreCacheKey())) {
                                found = true;
                                break;
                            }
                        }
                        if (!found) {
                            readers.add(newReaderContext.reader());
                        }
                    }
                    if (!readers.isEmpty()) {
                        // we don't want to close the inner readers, just increase ref on them
                        IndexReader newReader = new MultiReader(readers.toArray(new IndexReader[readers.size()]), false);
                        newSearcher = super.newSearcher(newReader, null);
                        closeNewSearcher = true;
                    }
                }
                if (newSearcher != null) {
                    IndicesWarmer.WarmerContext context =
                            new IndicesWarmer.WarmerContext(shardId, new Searcher("new_reader_warming", newSearcher));
                    warmer.warmNewReaders(context);
                }
                assert searcher.getIndexReader() instanceof ElasticsearchDirectoryReader :
                        "this class needs an ElasticsearchDirectoryReader but got: " + searcher.getIndexReader().getClass();
                warmer.warmTopReader(new IndicesWarmer.WarmerContext(shardId, new Searcher("top_reader_warming", searcher)));
            } catch (Throwable e) {
                if (isEngineClosed.get() == false) {
                    logger.warn("failed to prepare/warm", e);
                }
            } finally {
                // no need to release the fullSearcher, nothing really is done...
                if (newSearcher != null && closeNewSearcher) {
                    IOUtils.closeWhileHandlingException(newSearcher.getIndexReader()); // ignore
                }
            }
        }
        return searcher;
    }
}

/**
 * Lucene index callback that writes the pending translog operation to the
 * distributed log before the index change becomes visible; on any failure it
 * forces a recovery and aborts the indexing operation.
 */
private class WriteDistributedLogCallBack implements IndexCallBack {
    private Translog.Operation operation;
    private DLSN allocatedDLSN;
    private Translog.Location location;

    public WriteDistributedLogCallBack(Translog.Operation operation) {
        this.operation = operation;
        this.allocatedDLSN = null;
    }

    @Override
    public void callback() throws AbortingException {
        try {
            Tuple<Future<DLSN>, Tuple<BytesReference, Long>> result = dlTranslog.writeOperation(operation, latestTxid);
            long startTimestamp = System.currentTimeMillis();
            try {
                // block until the DL write completes (bounded by dlWriteTimeoutMillis)
                allocatedDLSN = FutureUtils.result(result.v1(), Duration.fromMilliseconds(dlWriteTimeoutMillis));
                logger.debug("write data successfully, dlsn [{}], txid [{}]", allocatedDLSN, result.v2().v2());
                latestDlsn = allocatedDLSN;
                long endTimestamp = System.currentTimeMillis();
                if (endTimestamp - startTimestamp > 50) {
                    logger.warn("cost [{}] millis to write to dl, more than 50 millis", endTimestamp - startTimestamp);
                }
                location = dlTranslog.writeToLocal(result.v2().v1());
                return;
            } catch (Throwable t) {
                logger.error("errors when write operation log to dl", t);
                throw AbortingException.wrap(t);
            }
        } catch (Throwable t) {
            // outer catch also sees the wrapped AbortingException from above; it triggers
            // lease recovery before aborting the indexing operation
            logger.error("errors while write log to distributed log service", t);
            DLBasedEngine.this.checkLeaseTask.forceRecovery();
            throw AbortingException.wrap(t);
        }
    }
}

/**
 * Periodic task that acquires/renews the primary shard lease and drives the
 * primary/replica recovery state machine based on lease ownership.
 */
private class CheckLeaseTask implements Runnable {
    // bumped whenever the lease state changes; compared against recoveryVersion to detect stale recovery
    private AtomicLong expireVersion = new AtomicLong(1);
    private long recoveryVersion = 0;
    private long recoveryTargetTxid = -1;
    private boolean waitRecoverTaskStop = false;
    private String indexUUID;
    private boolean isCreateIndex = false;

    public CheckLeaseTask(boolean isCreateIndex) {
        this.indexUUID = engineConfig.getIndexSettings().get(IndexMetaData.SETTING_INDEX_UUID, IndexMetaData.INDEX_UUID_NA_VALUE);
        this.isCreateIndex = isCreateIndex;
    }

    // Marks primary recovery as unfinished and bumps the version so the next run() re-recovers.
    public void forceRecovery() {
        isPrimaryRecoverFinished = false;
        expireVersion.incrementAndGet();
    }

    /**
     * One lease-check iteration; reschedules itself via the thread pool.
     * Primary path: renew/acquire the lease and, on a version change, replay the
     * distributed log up to the last txid before accepting writes.
     * Replica path: close the writer and keep the replay task running.
     */
    @Override
    public void run() {
        if (isClosed.get()) {
            logger.debug("shard is closed, skip check lease process");
            return;
        }
        // make sure the DL stream exists before doing any lease work; retry until engine close
        while (!isClosed.get()) {
            try {
                DLBasedEngine.this.dlTranslog.init();
                break;
            } catch (Throwable e) {
                logger.warn("errors while create or delete log in dl, maybe zk or bk is done, sleep 1 second and check again", e);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                    // NOTE(review): interrupt status is swallowed here; consider Thread.currentThread().interrupt()
                    logger.error("errors while sleep", ie);
                }
                continue;
            }
        }
        long nextSendDelay = 500;
        try {
            if (isPrimaryInRouting && isSnapshotRecoveryFinished) {
                long sendTime = System.currentTimeMillis();
                GetOrChangePrimaryShardLeaseRequest checkLeaseRequest =
                        new GetOrChangePrimaryShardLeaseRequest(localNodeId, true, shardId.getIndex(), indexUUID, shardId.getId());
                ActionFuture<GetOrChangePrimaryShardLeaseResponse> future = checkLeaseAction.execute(checkLeaseRequest);
                GetOrChangePrimaryShardLeaseResponse response = future.actionGet(500);
                boolean isShardOwnLease = response.checkShardStatus(shardId.getIndex(), indexUUID, shardId.getId());
                if (!isShardOwnLease) {
                    // take over the lease; wait one lease period first so the old owner's lease expires
                    GetOrChangePrimaryShardLeaseRequest updateLeaseOwnerRequest =
                            new GetOrChangePrimaryShardLeaseRequest(localNodeId, false, shardId.getIndex(), indexUUID, shardId.getId());
                    ActionFuture<GetOrChangePrimaryShardLeaseResponse> updateFuture = checkLeaseAction.execute(updateLeaseOwnerRequest);
                    if (!this.isCreateIndex) {
                        Thread.sleep(leaseExpireTime);
                    }
                    updateFuture.actionGet(500);
                    nextSendDelay = 0;
                    logger.info("lease is changed, write the meta and wait for expired");
                    expireVersion.incrementAndGet();
                    waitRecoverTaskStop = false;
                } else {
                    this.isCreateIndex = false;
                    lastLeaseSendTime = sendTime;
                }
                if (System.currentTimeMillis() - lastLeaseSendTime < leaseExpireTime / 2) {
                    nextSendDelay = Math.min(leaseExpireTime / 5, nextSendDelay);
                }
                if (System.currentTimeMillis() - lastLeaseSendTime > leaseExpireTime) {
                    // lease lost: force a fresh recovery round
                    isPrimaryRecoverFinished = false;
                    expireVersion.incrementAndGet();
                    waitRecoverTaskStop = false;
                    logger.debug("lease is expired");
                } else {
                    long tempExpireVersion = this.expireVersion.get();
                    if (tempExpireVersion != recoveryVersion) {
                        // lease (re)acquired since last recovery: reopen the log writer and
                        // compute the replay target from the last txid in the distributed log
                        isPrimaryRecoverFinished = false;
                        resourceLock.lock();
                        try {
                            if (!isClosed.get()) {
                                DLBasedEngine.this.dlTranslog.closeLogWriter();
                                DLBasedEngine.this.dlTranslog.openLogWriter();
                            }
                        } catch (Throwable t) {
                            logger.error("errors while close and reopen log writer", t);
                            throw t;
                        } finally {
                            resourceLock.unlock();
                        }
                        recoveryTargetTxid = DLBasedEngine.this.dlTranslog.getLogWriter().getLastTxId();
                        DLBasedEngine.this.recoverFromDLTask.checkResource(true);
                        logger.debug("last record in distributed log is txid:[{}], while local state is dlsn[{}], txid[{}]",
                                recoveryTargetTxid, DLBasedEngine.this.latestDlsn, DLBasedEngine.this.latestTxid);
                        // should reset latest dlsn and latest txid here, because the latest dlsn maybe larger than dlsn in log
                        Map<String, String> commitUserData = DLBasedEngine.this.indexWriter.getCommitData();
                        DLBasedEngine.this.latestDlsn = commitUserData.containsKey(Engine.COMMIT_DLSN) ?
                                DLSN.deserialize(commitUserData.get(Engine.COMMIT_DLSN)) : DLSN.InitialDLSN;
                        DLBasedEngine.this.latestTxid.set(commitUserData.containsKey(Engine.COMMIT_TXID) ?
                                Long.parseLong(commitUserData.get(Engine.COMMIT_TXID)) : 0);
                        logger.debug("last record in distributed log is txid:[{}], while local state is dlsn[{}], txid[{}]",
                                recoveryTargetTxid, DLBasedEngine.this.latestDlsn, DLBasedEngine.this.latestTxid);
                        DLBasedEngine.this.recoverFromDLTask.stop(true);
                        CheckLeaseTask.this.recoveryVersion = tempExpireVersion;
                    }
                    if (!isPrimaryRecoverFinished) {
                        if (!waitRecoverTaskStop && !DLBasedEngine.this.recoverFromDLTask.isActualRunning()) {
                            if (DLBasedEngine.this.latestTxid.get() < recoveryTargetTxid || !indexWriter.isOpen()) {
                                DLBasedEngine.this.recoverFromDLTask.start("primary shard's lease is expired, should recover from dl first");
                            }
                        }
                        if (recoveryVersion == expireVersion.get()) {
                            if (DLBasedEngine.this.latestTxid.get() >= recoveryTargetTxid && indexWriter.isOpen()) {
                                // replay caught up: stop the replay task under the write lock and
                                // switch the engine into normal primary mode
                                try (ReleasableLock lock = writeLock.acquire()) {
                                    DLBasedEngine.this.recoverFromDLTask.stop(true);
                                    waitRecoverTaskStop = true;
                                    if (!DLBasedEngine.this.recoverFromDLTask.isActualRunning()) {
                                        isPrimaryRecoverFinished = true;
                                        waitRecoverTaskStop = false;
                                        logger.debug("primary recover from dl finished");
                                    }
                                    refresh("changed_to_primary");
                                    versionMap.clear();
                                    versionMap.setManager(searcherManager);
                                }
                            }
                        }
                    }
                }
            } else {
                this.isCreateIndex = false;
                // the primary maybe transfered to replica, then should call close writer to release the resource
                dlTranslog.closeLogWriter();
                long tempExpireVersion = this.expireVersion.get();
                if (recoveryVersion != tempExpireVersion) {
                    DLBasedEngine.this.recoverFromDLTask.stop(true);
                    if (!recoverFromDLTask.isActualRunning()) {
                        DLBasedEngine.this.recoverFromDLTask.start("primary shard maybe transfered to replica, should start replay task");
                        recoveryVersion = tempExpireVersion;
                    }
                } else if (!recoverFromDLTask.isActualRunning()) {
                    DLBasedEngine.this.recoverFromDLTask.stop(true);
                    DLBasedEngine.this.recoverFromDLTask.start("replay dl operation task is not running, start it");
                }
            }
            logger.trace("check lease after [{}] millis", nextSendDelay);
            threadPool.schedule(TimeValue.timeValueMillis(nextSendDelay), ThreadPool.Names.GENERIC, this);
        } catch (Throwable t) {
            // never let this task die silently: log and reschedule
            logger.error("errors while check lease", t);
            threadPool.schedule(TimeValue.timeValueMillis(nextSendDelay), ThreadPool.Names.GENERIC, this);
        }
    }
}

/**
 * Periodic task (primary only) that truncates distributed-log records already
 * committed by every shard copy, using the minimum commit DLSN across copies.
 */
private class TruncateOldLog implements Runnable {
    @Override
    public void run() {
        try {
            if (isPrimaryInRouting) {
                ActionFuture<IndexShardStatsResponse> result =
                        indexShardStatsAction.execute(new IndexShardStatsRequest(shardId.index().getName(), shardId.id()));
                IndexShardStatsResponse response = result.actionGet(10000);
                // only truncate when every copy reported stats successfully
                if (response.getTotalShards() == response.getSuccessfulShards() && response.getTotalShards() > 0) {
                    DLSN miniumDlsn = new DLSN(Long.MAX_VALUE, 0, 0);
                    boolean hasInvalidShardStats = false;
                    for (ShardStats shardStats : response.getShards()) {
                        boolean isInvalid = true;
                        CommonStats commonStats = shardStats.getStats();
                        if (commonStats != null) {
                            DLStats dlStats = commonStats.getDLStats();
                            if (dlStats != null && !dlStats.getCommitDlsn().equals(DLSN.InvalidDLSN)) {
                                isInvalid = false;
                                if (dlStats.getCommitDlsn().compareTo(miniumDlsn) < 0) {
                                    miniumDlsn = dlStats.getCommitDlsn();
                                }
                            } else {
                                logger.debug("dl [{}] status is invalid", dlStats);
                            }
                        } else {
                            logger.debug("common stats is null");
                        }
                        if (isInvalid) {
                            // one bad copy is enough to skip truncation this round
                            hasInvalidShardStats = true;
                            break;
                        }
                    }
                    if (!hasInvalidShardStats) {
                        if (!miniumDlsn.equals(DLSN.InitialDLSN)) {
                            logger.info("try to truncate log before [{}]", miniumDlsn);
                            dlTranslog.truncateLogBeforeDLSN(miniumDlsn);
                        }
                    }
                } else {
                    logger.warn("some shards failed to get stats: total shards [{}], shards give reponse [{}], failure exceptions [{}], maybe some shard is abnormal so skip truncate log",
                            response.getTotalShards(), response.getSuccessfulShards(),
                            response.getShardFailures().length > 0 ?
response.getShardFailures()[0] : "");
                }
            }
        } catch (Throwable t) {
            logger.error("catch exception when sleep to next lease check time", t);
        } finally {
            if (!isClosed.get()) {
                // check in 1 minutes
                threadPool.schedule(TimeValue.timeValueMinutes(1), ThreadPool.Names.GENERIC, this);
            }
        }
    }
}

/**
 * Self-rescheduling task that replays operations from the distributed log into
 * the local engine. Used both for replica catch-up and for primary recovery
 * after a lease change. State flags are volatile; scheduling is guarded by
 * {@code checkTaskLock}.
 */
private class ReplayDLOperationTask implements Runnable {
    private boolean needCheckResource = true;
    // desired state set by start()/stop(); the task itself checks it on every hop
    private boolean desireRunning = true;
    private Future<List<LogRecordWithDLSN>> readFuture;
    private long numberOfFailures = 0;
    private Throwable exception = null;
    private volatile boolean indexTaskRunning = false;
    private volatile boolean isCheckTaskRunning = false;
    private volatile ScheduledFuture<?> checkTaskFuture = null;
    private Lock checkTaskLock = new ReentrantLock();

    public void start(String reason) {
        logger.info(reason);
        this.desireRunning = true;
        this.needCheckResource = true;
        this.readFuture = null;
        // NOTE(review): lock()/unlock() without try/finally — an exception from schedule() leaks the lock
        checkTaskLock.lock();
        checkTaskFuture = threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.GENERIC, this);
        checkTaskLock.unlock();
    }

    // NOTE(review): the waitToStop parameter is never used; callers pass true but stop never blocks.
    public void stop(boolean waitToStop) {
        this.desireRunning = false;
        this.cancelReadFuture();
        if (dlTranslog != null) {
            dlTranslog.closeLogReader();
        }
        this.needCheckResource = true;
    }

    /** True while any of the task's stages (check hop, pending read, index apply) is in flight. */
    public boolean isActualRunning() {
        return indexTaskRunning || isCheckTaskRunning || checkTaskFuture != null || readFuture != null;
    }

    private void setCheckTaskFuture(ScheduledFuture<?> newCheckTaskFuture) {
        checkTaskLock.lock();
        checkTaskFuture = newCheckTaskFuture;
        checkTaskLock.unlock();
    }

    /** One replay hop: verify resources, issue a batch read, and reschedule on failure. */
    public void run() {
        isCheckTaskRunning = true;
        setCheckTaskFuture(null);
        try {
            if (!this.desireRunning || DLBasedEngine.this.isClosed.get()) {
                dlTranslog.closeLogReader();
                setCheckTaskFuture(null);
                return;
            }
            if (exception != null) {
                // surface an error recorded by an async callback on the previous hop
                throw exception;
            }
            checkResource(false);
            readAndReplayOperation();
            numberOfFailures = 0;
        } catch (Throwable t) {
            exception = null;
            needCheckResource = true;
            cancelReadFuture();
            dlTranslog.closeLogReader();
            ++ numberOfFailures;
            if (numberOfFailures > 100) {
                t = new RecoveryFromDistributedLogFailedException(shardId, "failed to recovery from dl, retried 100 times", t);
            }
            boolean engineFailed = maybeFailEngine("[recover from distributed log service]", t);
            if (engineFailed) {
                logger.error("errors in lucene engine, so fail this shard", t);
                return;
            } else {
                if (this.desireRunning && !DLBasedEngine.this.isClosed.get()) {
                    // exception occurs should wait 1 seconds and check again
                    // call refresh to clear version map
                    refresh("replay log failed, refresh it before replay again");
                    logger.error("errors while read operation from distributed log service and apply to local engine, fail no. [{}]", t, numberOfFailures);
                    checkTaskLock.lock();
                    checkTaskFuture = threadPool.schedule(TimeValue.timeValueMillis(500), ThreadPool.Names.GENERIC, this);
                    checkTaskLock.unlock();
                } else {
                    logger.info("recover task is stopped, desire running status is [{}], is closed status is [{}]",
                            this.desireRunning, DLBasedEngine.this.isClosed.get());
                    return;
                }
            }
        } finally {
            isCheckTaskRunning = false;
        }
    }

    private void cancelReadFuture() {
        try {
            if (readFuture != null) {
                FutureUtils.cancel(readFuture);
            }
        } catch (Throwable t) {
            logger.error("errors while cancel read future", t);
        } finally {
            readFuture = null;
        }
    }

    /**
     * Ensures the IndexWriter, SearcherManager and DL reader are open, then verifies
     * the local DLSN is not older than the first DLSN still present in the log
     * (otherwise the gap is unrecoverable and the shard must fail).
     *
     * @param skipCheck true to force the check even if needCheckResource is false
     */
    public void checkResource(boolean skipCheck) throws Throwable {
        if (!needCheckResource && !skipCheck) {
            return;
        }
        resourceLock.lock();
        try {
            if (isClosed.get()) {
                logger.info("engine is closed, just skip check resource");
                return;
            }
            // if dl service is down, could not open reader, but the index writer will always open
            // the index writer will not be opened and closed repeatly
            if (!indexWriter.isOpen()) {
                versionMap.clear();
                try {
                    IOUtils.close(searcherManager);
                } catch (Throwable t) {
                    logger.warn("Failed to close SearcherManager when check resource", t);
                }
                indexWriter = createWriter(false);
                searcherManager = createSearcherManager();
                versionMap.setManager(searcherManager);
            }
            if (dlTranslog.isReaderClosed()) {
                logger.info("try to open a new reader at [{}]", latestDlsn);
                dlTranslog.openLogReader(latestDlsn);
            }
        } catch (Throwable t) {
            logger.error("errors while create writer or reader, throw the exception to test if need to fail the shard", t);
            throw t;
        } finally {
            resourceLock.unlock();
        }
        DLSN firstDlsn = DLSN.InitialDLSN;
        firstDlsn = dlTranslog.getFirstLogDLSN();
        logger.debug("current latest dlsn is [{}], the first dlsn in dl is [{}]", latestDlsn, firstDlsn);
        if (firstDlsn != null && firstDlsn.compareTo(latestDlsn) > 0) {
            throw new RecoveryFromDistributedLogFailedException(shardId,
                    String.format("dlsn in lucene is [%s], while the first dlsn in log service is [%s], newer than current so that could not recovery",
                            latestDlsn, firstDlsn));
        }
        needCheckResource = false;
    }

    /**
     * Issues an async batch read from the distributed log; on success the records
     * are applied to the engine on a generic-pool thread and the task reschedules itself.
     */
    private void readAndReplayOperation() throws Throwable {
        if (readFuture == null) {
            // 10 is suitable, it means will read at most 10 records, it maybe less than 10 records
            readFuture = dlTranslog.readBatchOperations(10);
            readFuture.addEventListener(new FutureEventListener<List<LogRecordWithDLSN>>() {
                @Override
                public void onFailure(Throwable t) {
                    // record the failure; the next hop of run() will rethrow it
                    exception = t;
                    if (desireRunning && !DLBasedEngine.this.isClosed.get()) {
                        checkTaskLock.lock();
                        checkTaskFuture = threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.GENERIC, ReplayDLOperationTask.this);
                        checkTaskLock.unlock();
                    }
                }

                @Override
                public void onSuccess(final List<LogRecordWithDLSN> logRecordWithDLSNs) {
                    threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.GENERIC, new Runnable() {
                        @Override
                        public void run() {
                            try {
                                if (!desireRunning || DLBasedEngine.this.isClosed.get()) {
                                    return;
                                }
                                indexTaskRunning = true;
                                if (logRecordWithDLSNs == null) {
                                    throw new Exception("read result from dl is empty, maybe the read thread is stopped or the engine is closed");
                                }
                                long sizeInBytes = 0;
                                int numOperations = 0;
                                for (LogRecordWithDLSN logRecordWithDLSN : logRecordWithDLSNs) {
                                    if (!logRecordWithDLSN.isControl()) {
                                        Translog.Operation operation =
DistributedTranslog.getOperationFromLogRecord(logRecordWithDLSN);
                                        // 20 is a fixed per-record overhead added to the payload size — TODO confirm against record framing
                                        sizeInBytes += (20 + logRecordWithDLSN.getPayload().length);
                                        ++ numOperations;
                                        applyTranslogOperation(operation);
                                    }
                                    if (Math.abs(latestTxid.get() - logRecordWithDLSN.getTransactionId()) > 1) {
                                        logger.debug("the txid is not continuous, current is [{}], the new is [{}], current dlsn is [{}], previous dlsn is [{}]",
                                                latestTxid.get(), logRecordWithDLSN.getTransactionId(), latestDlsn, logRecordWithDLSN.getDlsn());
                                    }
                                    // advance local replay position even for control records
                                    latestDlsn = logRecordWithDLSN.getDlsn();
                                    latestTxid.set(logRecordWithDLSN.getTransactionId());
                                }
                                dlTranslog.incrementSizeAndOperation(sizeInBytes, numOperations);
                                readFuture = null;
                            } catch (Throwable t) {
                                // stash the failure for the next run() hop to handle
                                exception = t;
                            } finally {
                                indexTaskRunning = false;
                                if (desireRunning && !DLBasedEngine.this.isClosed.get()) {
                                    checkTaskLock.lock();
                                    checkTaskFuture = threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.GENERIC, ReplayDLOperationTask.this);
                                    checkTaskLock.unlock();
                                }
                            }
                        }
                    });
                }
            });
        }
    }
}

/**
 * Merge scheduler that throttles indexing while too many merges are in flight
 * and optionally fails the engine when a merge hits an unrecoverable exception.
 */
private final class EngineMergeScheduler extends ElasticsearchConcurrentMergeScheduler {
    private final AtomicInteger numMergesInFlight = new AtomicInteger(0);
    private final AtomicBoolean isThrottling = new AtomicBoolean();

    EngineMergeScheduler(ShardId shardId, Settings indexSettings, MergeSchedulerConfig config) {
        super(shardId, indexSettings, config);
    }

    @Override
    public synchronized void beforeMerge(OnGoingMerge merge) {
        int maxNumMerges = mergeScheduler.getMaxMergeCount();
        if (numMergesInFlight.incrementAndGet() > maxNumMerges) {
            // getAndSet guards against activating throttling twice
            if (isThrottling.getAndSet(true) == false) {
                logger.info("now throttling indexing: numMergesInFlight={}, maxNumMerges={}", numMergesInFlight, maxNumMerges);
                indexingService.throttlingActivated();
                activateThrottling();
            }
        }
    }

    @Override
    public synchronized void afterMerge(OnGoingMerge merge) {
        int maxNumMerges = mergeScheduler.getMaxMergeCount();
        if (numMergesInFlight.decrementAndGet() < maxNumMerges) {
            if (isThrottling.getAndSet(false)) {
                logger.info("stop throttling indexing: numMergesInFlight={}, maxNumMerges={}", numMergesInFlight, maxNumMerges);
                indexingService.throttlingDeactivated();
                deactivateThrottling();
            }
        }
    }

    @Override
    protected void handleMergeException(final Directory dir, final Throwable exc) {
        logger.error("failed to merge", exc);
        if (config().getMergeSchedulerConfig().isNotifyOnMergeFailure()) {
            // fail the engine asynchronously on a generic-pool thread, never on the merge thread
            engineConfig.getThreadPool().generic().execute(new AbstractRunnable() {
                @Override
                public void onFailure(Throwable t) {
                    logger.debug("merge failure action rejected", t);
                }

                @Override
                protected void doRun() throws Exception {
                    MergePolicy.MergeException e = new MergePolicy.MergeException(exc, dir);
                    failEngine("merge failed", e);
                }
            });
        }
    }
}
}