package mizo.hbase; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellComparator; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.FSUtils; import java.io.IOException; import java.util.Arrays; import java.util.Comparator; import java.util.Iterator; import java.util.stream.Collectors; /** * Created by imrihecht on 12/10/16. */ public class MizoRegionFamilyCellsIterator implements Iterator<Cell> { /** * A customized version of CellComparator.compare() method. * We cannot use CellComparator itself, because it is inconsistent - * If it compares row, qualifier, column family - it works fine. * If it compares timestamps - it returns the opposite. * Case A: two cells - first with qualifier 'A' and timestamp 100, second with qualifier 'B' and timestamp 200, * then the first ('A') will be returned first and then the second ('B') (Correct) * Case B: two cells with qualifier 'A', one with timestamp 100 and the other with timestamp 200, * then the one with 200 will be returned before the one with 100. (Wrong) * This causes inconsistency, and the following is a fixed one. */ private Comparator<Cell> ASC_CELL_COMPARATOR = (left, right) -> { int c = CellComparator.compareRows(left, right); if (c != 0) { return c; } else { if (left.getFamilyLength() + left.getQualifierLength() == 0 && left.getTypeByte() == KeyValue.Type.Minimum.getCode()) { return 1; } else if (right.getFamilyLength() + right.getQualifierLength() == 0 && right.getTypeByte() == KeyValue.Type.Minimum.getCode()) { return -1; } else { boolean sameFamilySize = left.getFamilyLength() == right.getFamilyLength(); if (!sameFamilySize) { return Bytes.compareTo(left.getFamilyArray(), left.getFamilyOffset(), left.getFamilyLength(), right.getFamilyArray(), right.getFamilyOffset(), right.getFamilyLength()); } else { int diff = CellComparator.compareColumns(left, right); if (diff != 0) { return diff; } else { diff = CellComparator.compareTimestamps(right, left); // Different from CellComparator.compare() return diff != 0 ? diff : (255 & right.getTypeByte()) - (255 & left.getTypeByte()); } } } } }; /** * Ascending sorted cells iterator - the most recent cell is the last */ private final PeekingIterator<Cell> sortedRegionIterator; public MizoRegionFamilyCellsIterator(String regionEdgesFamilyPath) throws IOException { sortedRegionIterator = createSortedHFilesIterator(regionEdgesFamilyPath); } /** * Creates an ascending-sorted cells iterator, wrapped by a peeking iterator * @param regionEdgesFamilyPath Path of the HBase directory that contains Titan's Edges column-family * @return Ascending-sorted cells iterator, wrapped by a peeking iterator */ protected PeekingIterator<Cell> createSortedHFilesIterator(String regionEdgesFamilyPath) throws IOException { return Iterators.peekingIterator( Iterators.mergeSorted(createHFilesIterators(regionEdgesFamilyPath), ASC_CELL_COMPARATOR) ); } /** * Given a path to HFiles, gets a list of the HFiles residing in the directory, * create a Cells iterator per each HFile and return a collection of these iterators, * removing iterators that have no items * @param regionEdgesFamilyPath Path to HFiles * @return Collection of non-empty iterators of the given HFiles */ protected Iterable<Iterator<Cell>> createHFilesIterators(String regionEdgesFamilyPath) throws IOException { Path path = new Path(regionEdgesFamilyPath); FileSystem fs = path.getFileSystem(new Configuration()); return Arrays.stream(fs.listStatus(path, new FSUtils.HFileFilter(fs))) .map(FileStatus::getPath) .map(hfilePath -> MizoHFileIterator.createIterator(fs, hfilePath)) .filter(Iterator::hasNext) .collect(Collectors.toList()); } /** * Returns true if any of the HFiles in this region has a cell left to read */ @Override public boolean hasNext() { return sortedRegionIterator.hasNext(); } /** * Popping out cells until reaching the most updated version of a cell, then returns it * @return Most updated version of the cell */ @Override public Cell next() { Cell mostUpdated = sortedRegionIterator.next(); while (sortedRegionIterator.hasNext() && equalsRowFamilyQualifier(mostUpdated, sortedRegionIterator.peek())) { mostUpdated = sortedRegionIterator.next(); } return mostUpdated; } /** * Checks if two cells are equal by comparing their Row, Family and Qualifier (NOT comparing their timestamp */ private boolean equalsRowFamilyQualifier(Cell left, Cell right) { return CellComparator.equalsRow(left, right) && CellComparator.equalsFamily(left, right) && CellComparator.equalsQualifier(left, right); } }