/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.ratis.server.raftlog.segmented; import org.apache.ratis.conf.RaftProperties; import org.apache.ratis.protocol.RaftGroupMemberId; import org.apache.ratis.protocol.RaftPeerId; import org.apache.ratis.server.RaftServerConfigKeys; import org.apache.ratis.server.impl.RaftServerImpl; import org.apache.ratis.server.impl.ServerProtoUtils; import org.apache.ratis.server.protocol.TermIndex; import org.apache.ratis.server.raftlog.RaftLog; import org.apache.ratis.server.raftlog.RaftLogIOException; import org.apache.ratis.server.storage.RaftStorage; import org.apache.ratis.server.raftlog.segmented.LogSegment.LogRecord; import org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogCache.TruncateIndices; import org.apache.ratis.server.storage.RaftStorageDirectory.LogPathAndIndex; import org.apache.ratis.proto.RaftProtos.LogEntryProto; import org.apache.ratis.statemachine.StateMachine; import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import org.apache.ratis.util.AutoCloseableLock; import org.apache.ratis.util.JavaUtils; import org.apache.ratis.util.Preconditions; import org.apache.ratis.util.StringUtils; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.function.Consumer; import com.codahale.metrics.Timer; /** * The RaftLog implementation that writes log entries into segmented files in * local disk. * * The max log segment size is 8MB. The real log segment size may not be * exactly equal to this limit. If a log entry's size exceeds 8MB, this entry * will be stored in a single segment. * * There are two types of segments: closed segment and open segment. The former * is named as "log_startindex-endindex", the later is named as * "log_inprogress_startindex". * * There can be multiple closed segments but there is at most one open segment. * When the open segment reaches the size limit, or the log term increases, we * close the open segment and start a new open segment. A closed segment cannot * be appended anymore, but it can be truncated in case that a follower's log is * inconsistent with the current leader. * * Every closed segment should be non-empty, i.e., it should contain at least * one entry. * * There should not be any gap between segments. The first segment may not start * from index 0 since there may be snapshots as log compaction. The last index * in segments should be no smaller than the last index of snapshot, otherwise * we may have hole when append further log. */ public class SegmentedRaftLog extends RaftLog { /** * I/O task definitions. */ abstract static class Task { private final CompletableFuture<Long> future = new CompletableFuture<>(); private Timer.Context queueTimerContext; CompletableFuture<Long> getFuture() { return future; } void done() { completeFuture(); } final void completeFuture() { final boolean completed = future.complete(getEndIndex()); Preconditions.assertTrue(completed, () -> this + " is already " + StringUtils.completableFuture2String(future, false)); } void failed(IOException e) { this.getFuture().completeExceptionally(e); } abstract void execute() throws IOException; abstract long getEndIndex(); void startTimerOnEnqueue(Timer queueTimer) { queueTimerContext = queueTimer.time(); } void stopTimerOnDequeue() { if (queueTimerContext != null) { queueTimerContext.stop(); } } int getSerializedSize() { return 0; } @Override public String toString() { return getClass().getSimpleName() + ":" + getEndIndex(); } } /** The methods defined in {@link RaftServerImpl} which are used in {@link SegmentedRaftLog}. */ interface ServerLogMethods { ServerLogMethods DUMMY = new ServerLogMethods() {}; default boolean shouldEvictCache() { return false; } default long[] getFollowerNextIndices() { return null; } default long getLastAppliedIndex() { return INVALID_LOG_INDEX; } /** Notify the server that a log entry is being truncated. */ default void notifyTruncatedLogEntry(TermIndex ti) { } } /** * When the server is null, return the dummy instance of {@link ServerLogMethods}. * Otherwise, the server is non-null, return the implementation using the given server. */ private ServerLogMethods newServerLogMethods(RaftServerImpl impl) { if (impl == null) { return ServerLogMethods.DUMMY; } return new ServerLogMethods() { @Override public boolean shouldEvictCache() { return cache.shouldEvict(); } @Override public long[] getFollowerNextIndices() { return impl.getFollowerNextIndices(); } @Override public long getLastAppliedIndex() { return impl.getState().getLastAppliedIndex(); } @Override public void notifyTruncatedLogEntry(TermIndex ti) { try { final LogEntryProto entry = get(ti.getIndex()); impl.notifyTruncatedLogEntry(entry); } catch (RaftLogIOException e) { LOG.error("{}: Failed to read log {}", getName(), ti, e); } } }; } private final ServerLogMethods server; private final RaftStorage storage; private final StateMachine stateMachine; private final SegmentedRaftLogCache cache; private final SegmentedRaftLogWorker fileLogWorker; private final long segmentMaxSize; private final boolean stateMachineCachingEnabled; public SegmentedRaftLog(RaftGroupMemberId memberId, RaftServerImpl server, RaftStorage storage, long lastIndexInSnapshot, RaftProperties properties) { this(memberId, server, server != null? server.getStateMachine(): null, server != null? server::submitUpdateCommitEvent: null, storage, lastIndexInSnapshot, properties); } SegmentedRaftLog(RaftGroupMemberId memberId, RaftServerImpl server, StateMachine stateMachine, Runnable submitUpdateCommitEvent, RaftStorage storage, long lastIndexInSnapshot, RaftProperties properties) { super(memberId, lastIndexInSnapshot, properties); this.server = newServerLogMethods(server); this.storage = storage; this.stateMachine = stateMachine; segmentMaxSize = RaftServerConfigKeys.Log.segmentSizeMax(properties).getSize(); this.cache = new SegmentedRaftLogCache(memberId, storage, properties, getRaftLogMetrics()); this.fileLogWorker = new SegmentedRaftLogWorker(memberId, stateMachine, submitUpdateCommitEvent, server, storage, properties, getRaftLogMetrics()); stateMachineCachingEnabled = RaftServerConfigKeys.Log.StateMachineData.cachingEnabled(properties); } @Override protected void openImpl(long lastIndexInSnapshot, Consumer<LogEntryProto> consumer) throws IOException { loadLogSegments(lastIndexInSnapshot, consumer); File openSegmentFile = null; LogSegment openSegment = cache.getOpenSegment(); if (openSegment != null) { openSegmentFile = storage.getStorageDir() .getOpenLogFile(openSegment.getStartIndex()); } fileLogWorker.start(Math.max(cache.getEndIndex(), lastIndexInSnapshot), Math.min(cache.getLastIndexInClosedSegments(), lastIndexInSnapshot), openSegmentFile); } @Override public long getStartIndex() { return cache.getStartIndex(); } private void loadLogSegments(long lastIndexInSnapshot, Consumer<LogEntryProto> logConsumer) throws IOException { try(AutoCloseableLock writeLock = writeLock()) { List<LogPathAndIndex> paths = storage.getStorageDir().getLogSegmentFiles(); int i = 0; for (LogPathAndIndex pi : paths) { // During the initial loading, we can only confirm the committed // index based on the snapshot. This means if a log segment is not kept // in cache after the initial loading, later we have to load its content // again for updating the state machine. // TODO we should let raft peer persist its committed index periodically // so that during the initial loading we can apply part of the log // entries to the state machine boolean keepEntryInCache = (paths.size() - i++) <= cache.getMaxCachedSegments(); final Timer.Context loadSegmentContext = getRaftLogMetrics().getRaftLogLoadSegmentTimer().time(); cache.loadSegment(pi, keepEntryInCache, logConsumer, lastIndexInSnapshot); loadSegmentContext.stop(); } // if the largest index is smaller than the last index in snapshot, we do // not load the log to avoid holes between log segments. This may happen // when the local I/O worker is too slow to persist log (slower than // committing the log and taking snapshot) if (!cache.isEmpty() && cache.getEndIndex() < lastIndexInSnapshot) { LOG.warn("End log index {} is smaller than last index in snapshot {}", cache.getEndIndex(), lastIndexInSnapshot); cache.clear(); // TODO purge all segment files } } } @Override public LogEntryProto get(long index) throws RaftLogIOException { checkLogState(); final LogSegment segment; final LogRecord record; try (AutoCloseableLock readLock = readLock()) { segment = cache.getSegment(index); if (segment == null) { return null; } record = segment.getLogRecord(index); if (record == null) { return null; } final LogEntryProto entry = segment.getEntryFromCache(record.getTermIndex()); if (entry != null) { getRaftLogMetrics().onRaftLogCacheHit(); return entry; } } // the entry is not in the segment's cache. Load the cache without holding the lock. getRaftLogMetrics().onRaftLogCacheMiss(); checkAndEvictCache(); return segment.loadCache(record); } @Override public EntryWithData getEntryWithData(long index) throws RaftLogIOException { final LogEntryProto entry = get(index); if (entry == null) { throw new RaftLogIOException("Log entry not found: index = " + index); } if (!ServerProtoUtils.shouldReadStateMachineData(entry)) { return new EntryWithData(entry, null); } try { CompletableFuture<ByteString> future = null; if (stateMachine != null) { future = stateMachine.data().read(entry).exceptionally(ex -> { stateMachine.notifyLogFailed(ex, entry); return null; }); } return new EntryWithData(entry, future); } catch (Throwable e) { final String err = getName() + ": Failed readStateMachineData for " + ServerProtoUtils.toLogEntryString(entry); LOG.error(err, e); throw new RaftLogIOException(err, JavaUtils.unwrapCompletionException(e)); } } private void checkAndEvictCache() { if (server.shouldEvictCache()) { // TODO if the cache is hitting the maximum size and we cannot evict any // segment's cache, should block the new entry appending or new segment // allocation. cache.evictCache(server.getFollowerNextIndices(), fileLogWorker.getSafeCacheEvictIndex(), server.getLastAppliedIndex()); } } @Override public TermIndex getTermIndex(long index) { checkLogState(); try(AutoCloseableLock readLock = readLock()) { LogRecord record = cache.getLogRecord(index); return record != null ? record.getTermIndex() : null; } } @Override public TermIndex[] getEntries(long startIndex, long endIndex) { checkLogState(); try(AutoCloseableLock readLock = readLock()) { return cache.getTermIndices(startIndex, endIndex); } } @Override public TermIndex getLastEntryTermIndex() { checkLogState(); try(AutoCloseableLock readLock = readLock()) { return cache.getLastTermIndex(); } } @Override protected CompletableFuture<Long> truncateImpl(long index) { checkLogState(); try(AutoCloseableLock writeLock = writeLock()) { SegmentedRaftLogCache.TruncationSegments ts = cache.truncate(index); if (ts != null) { Task task = fileLogWorker.truncate(ts, index); return task.getFuture(); } } return CompletableFuture.completedFuture(index); } @Override protected CompletableFuture<Long> purgeImpl(long index) { try (AutoCloseableLock writeLock = writeLock()) { SegmentedRaftLogCache.TruncationSegments ts = cache.purge(index); LOG.debug("purging segments:{}", ts); if (ts != null) { Task task = fileLogWorker.purge(ts); return task.getFuture(); } } return CompletableFuture.completedFuture(index); } @Override protected CompletableFuture<Long> appendEntryImpl(LogEntryProto entry) { final Timer.Context context = getRaftLogMetrics().getRaftLogAppendEntryTimer().time(); checkLogState(); if (LOG.isTraceEnabled()) { LOG.trace("{}: appendEntry {}", getName(), ServerProtoUtils.toLogEntryString(entry)); } try(AutoCloseableLock writeLock = writeLock()) { validateLogEntry(entry); final LogSegment currentOpenSegment = cache.getOpenSegment(); if (currentOpenSegment == null) { cache.addOpenSegment(entry.getIndex()); fileLogWorker.startLogSegment(entry.getIndex()); } else if (isSegmentFull(currentOpenSegment, entry)) { cache.rollOpenSegment(true); fileLogWorker.rollLogSegment(currentOpenSegment); checkAndEvictCache(); } else if (currentOpenSegment.numOfEntries() > 0 && currentOpenSegment.getLastTermIndex().getTerm() != entry.getTerm()) { // the term changes final long currentTerm = currentOpenSegment.getLastTermIndex().getTerm(); Preconditions.assertTrue(currentTerm < entry.getTerm(), "open segment's term %s is larger than the new entry's term %s", currentTerm, entry.getTerm()); cache.rollOpenSegment(true); fileLogWorker.rollLogSegment(currentOpenSegment); checkAndEvictCache(); } // If the entry has state machine data, then the entry should be inserted // to statemachine first and then to the cache. Not following the order // will leave a spurious entry in the cache. CompletableFuture<Long> writeFuture = fileLogWorker.writeLogEntry(entry).getFuture(); if (stateMachineCachingEnabled) { // The stateMachineData will be cached inside the StateMachine itself. cache.appendEntry(ServerProtoUtils.removeStateMachineData(entry)); } else { cache.appendEntry(entry); } return writeFuture; } catch (Throwable throwable) { LOG.error("{}: Failed to append {}", getName(), ServerProtoUtils.toLogEntryString(entry), throwable); throw throwable; } finally { context.stop(); } } private boolean isSegmentFull(LogSegment segment, LogEntryProto entry) { if (segment.getTotalSize() >= segmentMaxSize) { return true; } else { final long entrySize = LogSegment.getEntrySize(entry); // if entry size is greater than the max segment size, write it directly // into the current segment return entrySize <= segmentMaxSize && segment.getTotalSize() + entrySize > segmentMaxSize; } } @Override public List<CompletableFuture<Long>> appendImpl(LogEntryProto... entries) { checkLogState(); if (entries == null || entries.length == 0) { return Collections.emptyList(); } try(AutoCloseableLock writeLock = writeLock()) { final TruncateIndices ti = cache.computeTruncateIndices(server::notifyTruncatedLogEntry, entries); final long truncateIndex = ti.getTruncateIndex(); final int index = ti.getArrayIndex(); LOG.debug("truncateIndex={}, arrayIndex={}", truncateIndex, index); final List<CompletableFuture<Long>> futures; if (truncateIndex != -1) { futures = new ArrayList<>(entries.length - index + 1); futures.add(truncate(truncateIndex)); } else { futures = new ArrayList<>(entries.length - index); } for (int i = index; i < entries.length; i++) { futures.add(appendEntry(entries[i])); } return futures; } } @Override public long getFlushIndex() { return fileLogWorker.getFlushIndex(); } @Override public void writeMetadata(long term, RaftPeerId votedFor) throws IOException { storage.getMetaFile().set(term, votedFor != null ? votedFor.toString() : null); } @Override public Metadata loadMetadata() throws IOException { return new Metadata( RaftPeerId.getRaftPeerId(storage.getMetaFile().getVotedFor()), storage.getMetaFile().getTerm()); } @Override public void syncWithSnapshot(long lastSnapshotIndex) { fileLogWorker.syncWithSnapshot(lastSnapshotIndex); // TODO purge log files and normal/tmp/corrupt snapshot files // if the last index in snapshot is larger than the index of the last // log entry, we should delete all the log entries and their cache to avoid // gaps between log segments. // Close open log segment if entries are already included in snapshot LogSegment openSegment = cache.getOpenSegment(); if (openSegment != null && openSegment.getEndIndex() <= lastSnapshotIndex) { fileLogWorker.closeLogSegment(openSegment); cache.clear(); } } @Override public boolean isConfigEntry(TermIndex ti) { return cache.isConfigEntry(ti); } @Override public void close() throws IOException { try(AutoCloseableLock writeLock = writeLock()) { super.close(); cache.clear(); } fileLogWorker.close(); storage.close(); getRaftLogMetrics().unregister(); } SegmentedRaftLogCache getRaftLogCache() { return cache; } @Override public String toString() { try(AutoCloseableLock readLock = readLock()) { if (isOpened()) { return super.toString() + ",f" + getFlushIndex() + ",i" + Optional.ofNullable(getLastEntryTermIndex()).map(TermIndex::getIndex).orElse(0L); } else { return super.toString(); } } } @Override public String toLogEntryString(LogEntryProto logEntry) { return ServerProtoUtils.toLogEntryString(logEntry, stateMachine::toStateMachineLogEntryString); } }