package lsm.internal;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.LockSupport;
import java.util.concurrent.locks.ReentrantLock;

import com.google.common.base.Preconditions;

import io.netty.buffer.ByteBuf;
import io.netty.buffer.ByteBufAllocator;
import io.netty.buffer.PooledByteBufAllocator;
import io.netty.buffer.Unpooled;
import io.netty.util.concurrent.RejectedExecutionHandler;

import lsm.Current;
import lsm.Engine;
import lsm.LogWriter;
import lsm.MemTable;
import lsm.SSTable;
import lsm.SSTableBuilder;
import lsm.SeekingIterator;
import lsm.TableCache;
import lsm.Version;
import lsm.VersionSet;
import lsm.base.Compaction;
import lsm.base.FileMetaData;
import lsm.base.FileUtils;
import lsm.base.InternalKey;
import lsm.base.LookupKey;
import lsm.base.LookupResult;
import lsm.base.Options;
import lsm.base.SeekingIteratorComparator;
import lsm.base.Snapshot;

/**
 * Default {@link Engine} implementation: an LSM-tree storage engine with a
 * write-ahead log, an in-memory memtable (rotated into an immutable memtable
 * when full) and background minor/major compaction of on-disk SSTables.
 *
 * <p>Thread-safety: shared state is guarded by {@link #engineLock}; writers
 * use a group-commit queue ({@link #queue}) so one leader thread commits a
 * whole batch under the lock.
 */
public class EngineImpl implements Engine {

    private static final String COMPACTION_THREAD_NAME = "compaction-thread";

    /** Working directory of the engine. */
    private final File databaseDir;
    private final VersionSet versions;
    private final TableCache tableCache;
    /** Set once the engine starts shutting down. */
    private final AtomicBoolean shuttingDown;
    /** Write-ahead log writer; replaced whenever the memtable is rotated. */
    private LogWriter log;
    private MemTable memTable;
    /** Memtable frozen for serialization; null when no minor compaction is pending. */
    private MemTable immutableMemTable;
    /** Queue of pending write tasks, used for group commit. */
    private Queue<WriteTask> queue;
    /** Comparator for internal keys. */
    private final Comparator<InternalKey> internalKeyComparator;
    /** Executor running background compactions (single worker thread). */
    private final ExecutorService compactionExecutor;
    /** Handle of the in-flight background compaction; null when idle. */
    private Future<?> backgroundCompaction;
    /**
     * Guards engine state shared between reader, writer and compaction
     * threads.
     */
    private final ReentrantLock engineLock = new ReentrantLock();
    /** Used to wait for / signal the completion of background compaction. */
    private final Condition backgroundCondition = engineLock.newCondition();
    /** File numbers of obsolete files that may need deletion. */
    private final List<Long> pendingFileNumbers = new ArrayList<>();

    public EngineImpl(File databaseDir) throws IOException {
        // TODO
        this.shuttingDown = new AtomicBoolean(false);
        this.log = null;
        this.databaseDir = databaseDir;
        this.internalKeyComparator = Options.INTERNAL_KEY_COMPARATOR;
        this.memTable = new MemTableImpl();
        // Compaction pool: 1 core / 1 max thread, 1 day keep-alive,
        // SynchronousQueue hand-off, abort when saturated.
        this.compactionExecutor = new ThreadPoolExecutor(1, 1, 1, TimeUnit.DAYS,
                new SynchronousQueue<>(), new CompactionThreadFactory(),
                new ThreadPoolExecutor.AbortPolicy());
        this.tableCache = new TableCacheImpl(databaseDir, Options.TABLE_CACHE_SIZE);
        engineLock.lock();
        try {
            // Validate the CURRENT file.
            Current current = new CurrentImpl(databaseDir);
            Preconditions.checkState(current.isAvailable(), "current不可用");
            // Build the version set and recover persisted version state.
            this.versions = new VersionSetImpl(databaseDir, tableCache);
            versions.recover();
            // TODO replay log files that were not flushed into the memtable, in order
            // TODO apply the recovered version edit
            // Kick off a compaction if one is already needed.
            maybeCompaction();
        } finally {
            engineLock.unlock();
        }
        // Initialize the group-commit queue.
        this.queue = new ConcurrentLinkedQueue<>();
    }

    @Override
    public void put(byte[] key, byte[] value) throws IOException {
        // Enqueue the task; a single leader thread commits a whole batch.
        WriteTask task = new WriteTask(Unpooled.wrappedBuffer(key), Unpooled.wrappedBuffer(value));
        queue.offer(task);
        // Park while we are neither the queue head nor already committed.
        while (queue.peek() != task && !task.isDone()) {
            LockSupport.park();
        }
        // Another leader committed our task as part of its batch.
        if (task.isDone()) {
            return;
        }
        // We are the queue head: collect a batch and commit it.
        else if (!task.isDone() && queue.peek() == task) {
            List<WriteTask> writeBatch = new ArrayList<>();
            // Drain up to WRITE_BATCH_LIMIT_SIZE tasks into the batch. Writers
            // arriving before this point park; later ones see a new head.
            while (!queue.isEmpty() && writeBatch.size() < Options.WRITE_BATCH_LIMIT_SIZE) {
                writeBatch.add(queue.poll());
            }
            engineLock.lock();
            try {
                // Make sure the memtable has room for this batch.
                mkRoomForWrite();
                // Reserve one sequence number per task in the batch.
                long seq = versions.getLastSequence() + 1;
                long lastSeq = seq + writeBatch.size() - 1;
                versions.setLastSequence(lastSeq);
                ByteBuf[] records = new ByteBuf[writeBatch.size()];
                int index = 0;
                // BUGFIX: every record used the same sequence number and the
                // value of the last polled task; use this task's value and a
                // consecutive per-task sequence number instead.
                long recordSeq = seq;
                for (WriteTask t : writeBatch) {
                    InternalKey internalKey = t.getInternalKey(recordSeq++);
                    // One record per task; records are written in a single
                    // log call below to merge disk IOs.
                    ByteBuf record = PooledByteBufAllocator.DEFAULT.buffer();
                    // Record layout: see the LogWriter interface docs.
                    record.writeInt(internalKey.size());
                    record.writeBytes(internalKey.encode().slice());
                    record.writeInt(t.getValue().readableBytes());
                    record.writeBytes(t.getValue().slice());
                    records[index++] = record;
                }
                // One combined write-ahead-log append for the whole batch.
                log.addRecord(records, false);
                // Apply the batch to the memtable. getInternalKey returns the
                // key (with the sequence assigned above) from its cache.
                for (WriteTask t : writeBatch) {
                    memTable.add(t.getInternalKey(seq), t.getValue());
                }
            } finally {
                engineLock.unlock();
            }
            // Mark the batch done and wake the parked writers.
            for (WriteTask t : writeBatch) {
                t.setDone();
                // Avoid unparking ourselves.
                if (t.getThread() != Thread.currentThread()) {
                    Preconditions.checkState(t.getThread().isAlive(), t.getThread().getName() + " 已死");
                    LockSupport.unpark(t.getThread());
                }
            }
            // BUGFIX: the next queue head (a task beyond the batch limit) may
            // still be parked and would never be woken otherwise; unpark it so
            // it can become the next leader. (unpark-before-park is safe: it
            // grants a permit.)
            WriteTask nextHead = queue.peek();
            if (nextHead != null) {
                LockSupport.unpark(nextHead.getThread());
            }
        }
    }

    @Override
    public byte[] get(byte[] key) throws Exception {
        // Wrap the user key for lookup at the latest sequence number.
        ByteBuf keyBuf = Unpooled.wrappedBuffer(key);
        long lastSeq = versions.getLastSequence();
        LookupKey lookupKey = new LookupKey(new InternalKey(keyBuf, lastSeq));
        // Pin the version current at read time as a snapshot.
        try (Snapshot snapshot = new Snapshot(versions.getCurrent())) {
            LookupResult result = null;
            // Lock while touching the (im)mutable memtables.
            engineLock.lock();
            try {
                // Try the active memtable first.
                result = memTable.get(lookupKey);
                if (result != null) {
                    return result.getValueBytes();
                }
                // Then the immutable memtable, if one is pending.
                if (immutableMemTable != null) {
                    result = immutableMemTable.get(lookupKey);
                    if (result != null) {
                        return result.getValueBytes();
                    }
                }
            } finally {
                engineLock.unlock();
            }
            // Finally fall back to the on-disk snapshot.
            result = snapshot.get(lookupKey);
            return result != null ? result.getValueBytes() : null;
        }
    }

    /**
     * Ensures the memtable has room for the next write, yielding to the
     * compaction thread when the engine is falling behind.
     *
     * @throws IOException if rotating the write-ahead log fails
     */
    private void mkRoomForWrite() throws IOException {
        // Caller must hold the engine lock.
        Preconditions.checkState(engineLock.isHeldByCurrentThread());
        while (true) {
            // Too many level-0 files: slow the writer down by ~1 ms, releasing
            // the lock so the compaction thread can make progress.
            if (versions.getCurrent().files(0) > Options.L0_SLOW_DOWN_COUNT) {
                engineLock.unlock();
                try {
                    Thread.sleep(1);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag instead of swallowing it.
                    Thread.currentThread().interrupt();
                } finally {
                    engineLock.lock();
                }
            }
            // Memtable still has room: done.
            if (memTable.size() < Options.MEMTABLE_LIMIT) {
                break;
            }
            // Previous minor compaction has not finished: wait, then re-check
            // everything. BUGFIX: previously fell through with stale state and
            // could overwrite a still-pending immutableMemTable below.
            if (immutableMemTable != null) {
                backgroundCondition.awaitUninterruptibly();
                continue;
            }
            // Too many level-0 files: wait for major compaction, then re-check.
            if (versions.getCurrent().files(0) > Options.LEVEL0_LIMIT_COUNT) {
                backgroundCondition.awaitUninterruptibly();
                continue;
            }
            // Memtable is full (guaranteed by the break above): rotate the
            // log, freeze the memtable and trigger a minor compaction.
            log.close();
            long logNumber = versions.getNextFileNumber();
            log = new LogWriterImpl(databaseDir, logNumber, false);
            immutableMemTable = memTable;
            memTable = new MemTableImpl();
            maybeCompaction();
        }
    }

    /**
     * Triggers an asynchronous compaction if one is needed and none is
     * already running. Caller must hold {@link #engineLock}.
     */
    private void maybeCompaction() {
        boolean shouldNotStartCompact =
                backgroundCompaction != null // previous compaction still running
                || shuttingDown.get() // engine is shutting down
                // nothing in memory to serialize and the version set does not
                // request a compaction either
                || (immutableMemTable == null && !versions.needsCompaction());
        if (!shouldNotStartCompact) {
            // Start the background compaction task.
            backgroundCompaction = compactionExecutor.submit(new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    engineLock.lock();
                    try {
                        backgroundCompaction();
                    } catch (IOException e) {
                        e.printStackTrace();
                    } finally {
                        try {
                            // BUGFIX: clear the handle first — otherwise
                            // shouldNotStartCompact stays true forever and no
                            // further compaction could ever start.
                            backgroundCompaction = null;
                            try {
                                maybeCompaction();
                            } catch (RejectedExecutionException ignored) {
                                // The single worker slot is still occupied by
                                // this task; the next write re-triggers.
                            }
                        } finally {
                            // Wake threads waiting on compaction, then unlock.
                            try {
                                backgroundCondition.signalAll();
                            } finally {
                                engineLock.unlock();
                            }
                        }
                    }
                    return null;
                }
            });
        }
    }

    /**
     * Runs one round of compaction: serializes the immutable memtable first
     * (highest priority), then performs a minor or major compaction if the
     * version set requests one.
     */
    private void backgroundCompaction() throws IOException {
        serializeMemTable();
        Compaction compaction = versions.pickCompaction();
        if (compaction != null) {
            if (compaction.isMinorCompaction()) {
                minorCompact(compaction);
            } else {
                majorCompact(compaction);
            }
        }
    }

    /**
     * Serializes the immutable memtable (if any) into a level-0 SSTable and
     * installs it in the version set.
     *
     * @throws IOException on serialization or version-edit failure
     */
    private void serializeMemTable() throws IOException {
        engineLock.lock();
        try {
            // Nothing to serialize.
            if (immutableMemTable == null || immutableMemTable.isEmpty()) {
                return;
            }
            VersionEdit edit = new VersionEdit();
            // Persist the immutable memtable into an SSTable file.
            FileMetaData fileMetaData = memToSSTable(immutableMemTable);
            if (fileMetaData != null && fileMetaData.getFileSize() > 0) {
                // The serialized file goes straight to level 0.
                // TODO pick a deeper target level via
                // versions.getCurrent().pickLevelForMemTableOutput(smallest, largest)
                int level = 0;
                edit.addFile(level, fileMetaData);
            }
            // Record the current log file and apply the edit.
            edit.setLogNumber(log.getFileNumber());
            versions.logAndApply(edit);
            // The immutable memtable is now persisted.
            immutableMemTable = null;
            handlePendingFiles();
        } finally {
            try {
                // Wake writers waiting for the minor compaction to finish.
                backgroundCondition.signalAll();
            } finally {
                engineLock.unlock();
            }
        }
    }

    /**
     * Minor compaction: moves a single input file one level down.
     *
     * @param compaction compaction description (non-null)
     * @throws IOException on version-edit failure
     */
    private void minorCompact(Compaction compaction) throws IOException {
        Objects.requireNonNull(compaction);
        FileMetaData fileMetaData = compaction.getLevelInputs().get(0);
        compaction.getEdit().deleteFile(compaction.getLevel(), fileMetaData.getNumber());
        compaction.getEdit().addFile(compaction.getLevel() + 1, fileMetaData);
        versions.logAndApply(compaction.getEdit());
    }

    /**
     * Major compaction: merge-sorts the SSTables of two adjacent levels into
     * new SSTables on the upper level, dropping shadowed duplicates of a user
     * key.
     *
     * @param compaction compaction description (non-null)
     * @throws IOException on SSTable or version-edit failure
     */
    private void majorCompact(Compaction compaction) throws IOException {
        Objects.requireNonNull(compaction);
        MajorCompactionHelper helper = new MajorCompactionHelper(compaction);
        // Inputs of the lower and upper level.
        List<FileMetaData> levelInputs = compaction.getLevelInputs();
        List<FileMetaData> levelUpInputs = compaction.getLevelUpInputs();
        // Priority queue ordered by each iterator's current entry.
        Comparator<SeekingIterator<ByteBuf, ByteBuf>> comparator =
                new SeekingIteratorComparator(internalKeyComparator);
        int sorterSize = levelInputs.size() + levelUpInputs.size();
        // BUGFIX: parameterize the queue (was a raw type) and guard against an
        // initial capacity of 0, which PriorityQueue rejects.
        PriorityQueue<SeekingIterator<ByteBuf, ByteBuf>> sorter =
                new PriorityQueue<>(Math.max(1, sorterSize), comparator);
        // Feed the iterators of all input SSTables into the queue.
        addTableIterators(sorter, levelInputs);
        addTableIterators(sorter, levelUpInputs);
        // Release the lock during the merge; re-acquired in the finally below.
        engineLock.unlock();
        try {
            // Last emitted key, used to drop older duplicates of a user key.
            InternalKey lastKey = null;
            while (sorter.size() > 0) {
                // Memtable serialization has top priority so that writers are
                // never blocked for long.
                engineLock.lock();
                try {
                    serializeMemTable();
                } catch (IOException e) {
                    e.printStackTrace();
                } finally {
                    engineLock.unlock();
                }
                // Iterator with the smallest current entry; advance it.
                SeekingIterator<ByteBuf, ByteBuf> iter = sorter.poll();
                Entry<ByteBuf, ByteBuf> entry = iter.next();
                InternalKey thisKey = InternalKey.decode(entry.getKey());
                if (lastKey != null && lastKey.getUserKey().equals(thisKey.getUserKey())) {
                    // Older duplicate of the same user key: drop it.
                } else {
                    // Open an output SSTable on demand.
                    if (!helper.hasBuilder()) {
                        helper.newSSTable();
                    }
                    helper.addToSSTable(entry.getKey(), entry.getValue());
                    // Roll over once the current output file is full.
                    if (helper.isFull()) {
                        helper.finishSSTable();
                    }
                    lastKey = thisKey;
                }
                // Put the iterator back unless it is exhausted.
                if (iter.hasNext()) {
                    sorter.add(iter);
                }
            }
            // Flush the last, possibly partial, output file.
            helper.finishSSTable();
        } finally {
            engineLock.lock();
        }
        // Record the new files in the version set.
        helper.installSSTable();
    }

    /**
     * Opens each file's SSTable via the table cache and adds its iterator to
     * the given priority queue.
     */
    private void addTableIterators(PriorityQueue<SeekingIterator<ByteBuf, ByteBuf>> sorter,
            List<FileMetaData> files) {
        files.forEach(fileMetaData -> {
            SSTable sstable = null;
            try {
                sstable = tableCache.getTable(fileMetaData.getNumber());
            } catch (Exception e) {
                e.printStackTrace();
            }
            Preconditions.checkNotNull(sstable);
            sorter.add(sstable.iterator());
        });
    }

    /**
     * Persists a memtable into a new SSTable file.
     *
     * @param mem the memtable to serialize
     * @return metadata of the created file, or null when called without the
     *         engine lock or with an empty memtable
     * @throws IOException on file creation/write failure
     */
    private FileMetaData memToSSTable(MemTable mem) throws IOException {
        // Must be called inside the engine-lock critical section.
        if (!engineLock.isHeldByCurrentThread()) {
            return null;
        }
        // Skip empty memtables.
        if (mem.isEmpty()) {
            return null;
        }
        // Create the new .sst file.
        long fileNumber = versions.getNextFileNumber();
        File file = FileUtils.newSSTableFile(databaseDir, fileNumber);
        FileChannel channel = new RandomAccessFile(file, "rw").getChannel();
        InternalKey smallest = null;
        InternalKey largest = null;
        // TODO construct the real SSTableBuilder over the channel; until then
        // this method NPEs below when reached with a non-empty memtable.
        SSTableBuilder tableBuilder = null;
        engineLock.lock();
        try {
            // NOTE(review): assumes mem.iterator() yields keys in ascending
            // order, so the first key is the smallest and the last the
            // largest — confirm against MemTableImpl.
            Iterator<Entry<InternalKey, ByteBuf>> iter = mem.iterator();
            while (iter.hasNext()) {
                Entry<InternalKey, ByteBuf> entry = iter.next();
                InternalKey key = entry.getKey();
                if (smallest == null) {
                    smallest = key;
                }
                largest = key;
                tableBuilder.add(entry);
            }
            tableBuilder.finish();
        } finally {
            // BUGFIX: close the channel instead of leaking it.
            try {
                channel.close();
            } finally {
                engineLock.unlock();
            }
        }
        return new FileMetaData(fileNumber, file.length(), smallest, largest);
    }

    /**
     * Deletes files recorded in {@link #pendingFileNumbers}.
     */
    private void handlePendingFiles() {
        // TODO
    }

    /**
     * Helper that builds the output SSTables of a major compaction and
     * installs them into the version set.
     *
     * @author bird
     */
    private class MajorCompactionHelper {
        private final Compaction compaction;
        /** Metadata of all output files produced so far. */
        private final List<FileMetaData> fileMetaDatas = new ArrayList<>();
        // Current output SSTable.
        private FileChannel channel;
        private SSTableBuilder builder;
        // Bookkeeping for the file currently being built.
        private long currentFileNumber;
        private long currentFileSize;
        private InternalKey currentSmallest;
        private InternalKey currentLargest;
        /** Total size of all output files. */
        private long totalBytes;

        MajorCompactionHelper(Compaction compaction) {
            this.compaction = compaction;
        }

        /**
         * @return whether an output SSTable builder is currently open
         */
        boolean hasBuilder() {
            // BUGFIX: was inverted (builder == null), which made majorCompact
            // skip builder creation and NPE on the first add.
            return builder != null;
        }

        /**
         * @return whether the current SSTable reached its size limit
         * @throws IOException if the builder cannot report its size
         */
        boolean isFull() throws IOException {
            return builder.getFileSize() >= Compaction.MAX_FILE_SIZE;
        }

        /**
         * Opens a new output SSTable file and its builder.
         *
         * @throws IOException on file creation failure
         */
        void newSSTable() throws IOException {
            engineLock.lock();
            try {
                currentFileNumber = versions.getNextFileNumber();
                // Track the file as pending until it is installed.
                // BUGFIX: was added twice.
                pendingFileNumbers.add(currentFileNumber);
                currentFileSize = 0;
                currentSmallest = null;
                currentLargest = null;
                File file = FileUtils.newSSTableFile(databaseDir, currentFileNumber);
                channel = new RandomAccessFile(file, "rw").getChannel();
                // TODO construct the real SSTableBuilder over the channel
                builder = null;
            } finally {
                engineLock.unlock();
            }
        }

        /**
         * Appends one key/value pair to the current SSTable.
         *
         * @param key encoded internal key
         * @param value value bytes
         * @throws IOException on write failure
         */
        void addToSSTable(ByteBuf key, ByteBuf value) throws IOException {
            // Keys arrive in ascending order: the first key of the file is
            // the smallest, the most recent one the largest.
            if (currentSmallest == null) {
                currentSmallest = InternalKey.decode(key);
            }
            currentLargest = InternalKey.decode(key);
            builder.add(key, value);
        }

        /**
         * Finishes the current SSTable: flushes it to disk, records its
         * metadata and resets the builder. No-op when no builder is open.
         *
         * @throws IOException on flush/close failure
         */
        void finishSSTable() throws IOException {
            if (builder == null) {
                return;
            }
            builder.finish();
            // Record the finished file's size and metadata.
            currentFileSize = builder.getFileSize();
            totalBytes += currentFileSize;
            fileMetaDatas.add(new FileMetaData(currentFileNumber, currentFileSize,
                    currentSmallest, currentLargest));
            builder = null;
            // Force to disk, then close and drop the channel.
            channel.force(true);
            channel.close();
            channel = null;
        }

        /**
         * Records all output files (and the deletion of the merged inputs) in
         * the version set.
         *
         * @throws IOException on version-edit failure
         */
        void installSSTable() throws IOException {
            VersionEdit edit = compaction.getEdit();
            // Mark the merged input files for deletion.
            compaction.deleteFiles(compaction.getEdit());
            int level = compaction.getLevel();
            // Output files belong one level up.
            for (FileMetaData newFile : fileMetaDatas) {
                edit.addFile(level + 1, newFile);
                // Remove by value — box explicitly so the List.remove(int)
                // index overload can never be selected.
                pendingFileNumbers.remove(Long.valueOf(newFile.getNumber()));
            }
            // Log and apply the edit, then clean up obsolete files.
            versions.logAndApply(edit);
            handlePendingFiles();
        }
    }

    /**
     * A single write request: the key/value pair (as ByteBufs) plus the
     * submitting thread, so the batch leader can unpark it when done.
     *
     * @author bird
     */
    private static class WriteTask {
        private final ByteBuf key;
        private final ByteBuf value;
        private final Thread thread;
        private InternalKey internalKey;
        // volatile: written by the batch leader, read by the parked submitter.
        private volatile boolean done;

        public WriteTask(ByteBuf key, ByteBuf value) {
            this.key = key;
            this.value = value;
            this.thread = Thread.currentThread();
            this.done = false;
        }

        public ByteBuf getKey() {
            return key;
        }

        public ByteBuf getValue() {
            return value;
        }

        /**
         * Lazily builds (and caches) the internal key with the given
         * sequence number; subsequent calls ignore the argument.
         */
        public InternalKey getInternalKey(long seq) {
            if (internalKey == null) {
                internalKey = new InternalKey(key, seq);
            }
            return internalKey;
        }

        public void setDone() {
            this.done = true;
        }

        public boolean isDone() {
            return done;
        }

        public Thread getThread() {
            return thread;
        }
    }

    /**
     * Thread factory naming the compaction worker and logging uncaught
     * exceptions.
     */
    private static class CompactionThreadFactory implements ThreadFactory {
        @Override
        public Thread newThread(Runnable r) {
            // BUGFIX: the Runnable was dropped (new Thread(name)), so pool
            // worker threads executed nothing and compaction tasks never ran.
            Thread t = new Thread(r, COMPACTION_THREAD_NAME);
            t.setUncaughtExceptionHandler((thread, e) -> e.printStackTrace());
            return t;
        }
    }
}