package com.indeed.mph; import com.google.common.io.CountingOutputStream; import com.google.common.io.LittleEndianDataOutputStream; import com.indeed.mph.serializers.SmartStringSerializer; import com.indeed.util.core.Pair; import com.indeed.util.io.BufferedFileDataOutputStream; import com.indeed.util.mmap.DirectMemory; import com.indeed.util.mmap.MMapBuffer; import it.unimi.dsi.bits.AbstractBitVector; import it.unimi.dsi.bits.BitVector; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.bits.TransformationStrategy; import it.unimi.dsi.sux4j.bits.HintedBsearchSelect; import it.unimi.dsi.sux4j.bits.Rank9; import it.unimi.dsi.sux4j.bits.Select; import it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction; import org.apache.log4j.Logger; import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.DataOutput; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.SequenceInputStream; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.FileChannel; import java.util.Arrays; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; /** * Utility class to write mph tables to disk. * <p> * The static methods write and writeWithTempStorage are all you need * to write an Iterable or Iterator of key-value * {@link com.indeed.util.core.Pair}s to a directory, given a config. * The minimum config is just the key serializer - see TableConfig for * details. 
Example:
 * <p>
 * <code>
 * final TableConfig&lt;Long, Long&gt; config =
 *     new TableConfig()
 *         .withKeySerializer(new SmartLongSerializer())
 *         .withValueSerializer(new SmartVLongSerializer());
 * final Set&lt;Pair&lt;Long, Long&gt;&gt; entries = new HashSet&lt;&gt;();
 * for (long i = 0; i &lt; 20; ++i) {
 *     entries.add(new Pair(i, i * i));
 * }
 * TableWriter.write(new File("squares"), config, entries);
 * </code>
 * <p>
 * As a convenience, the main method lets you build tables from TSV files:
 * <p>
 * java com.indeed.recommend.common.platform.mph.TableWriter [options] output_dir input.tsv...
 * <p>
 * where options are:
 * <p>
 * --separator: separator to use instead of tab
 * --keySerializer: full class name of the key serializer (default .SmartStringSerializer)
 * --valueSerializer: full class name of the value serializer (default .SmartStringSerializer)
 * --keyStorage: set to IMPLICIT to remove keys from table
 * --offsetStorage: override the default choice
 * --rangeChecking: set to AUTOMATIC to enable tracking of min/max keys
 * --maxHeapUsage: the limit beyond which offsets are mmapped instead of being stored in the heap
 * --signatureWidth: bits per key to use in a bloom filter (required for IMPLICIT keyStorage)
 * <p>
 * Serializers in the com.indeed.mph.serializers package can be
 * abbreviated with a leading ".", e.g. ".SmartStringSerializer".
 * <p>
 * The serializer syntax also allows simple expressions of the form
 * Class(args...), notably useful for generic serializers, e.g.:
 * <p>
 * ".SmartListSerializer(.SmartLongSerializer, 20)"
 * <p>
 * is a serializer for lists of up to 20 Longs.
 *
 * @author alexs
 */
public class TableWriter {
    private static final Logger LOGGER = Logger.getLogger(TableWriter.class);

    // Upper bound on the number of temporary shard files created while
    // rewriting entries into hash order (see splitToShards).
    public static final int MAX_TEMP_SHARDS = 512;

    // Static utility class: not instantiable.
    private TableWriter() {}

    /**
     * Constructs and writes a minimal perfect hash table to
     * outputDir. The entries may be iterated over multiple times.
* * @param <K> key type * @param <V> value type * @param outputDir directory to write the hash table files to * @param config a {@link TableConfig} specifying at least a key serializer * @param entries an iterable of key-value Pairs representing entries in the table * @throws IOException if unable to write the files or serialize the data */ public static <K, V> void write( final File outputDir, final TableConfig<K, V> config, final Iterable<Pair<K, V>> entries) throws IOException { long dataSize = 0; for (final Pair<K, V> e : entries) { if (e.getFirst() == null || (e.getSecond() == null && config.getValueSerializer() != null)) { throw new IllegalArgumentException("can't store nulls: " + e); } dataSize += config.sizeOf(e.getFirst(), e.getSecond()); } write(outputDir, config, entries, dataSize); } /** * As above, with a pre-computed dataSize. * * @param <K> key type * @param <V> value type * @param outputDir directory to write the hash table files to * @param config a {@link TableConfig} specifying at least a key serializer * @param entries an iterable of key-value Pairs representing entries in the table * @param dataSize the final size of the raw serialized data in the table * (omitting keys if using implicit storage, and omitting values if not used) * @throws IOException if unable to write the files or serialize the data */ public static <K, V> void write( final File outputDir, final TableConfig<K, V> config, final Iterable<Pair<K, V>> entries, final long dataSize) throws IOException { if (!config.isValid()) { throw new IOException("invalid table config: " + config); } ensureOutputDirectory(outputDir); final TransformationStrategy transformationStrategy = new SerializerTransformationStrategy(config.getKeySerializer()); GOVMinimalPerfectHashFunction<K> mph = null; final List<K> minMaxKeys = new ArrayList<>(); minMaxKeys.add(null); minMaxKeys.add(null); try { final boolean trackMinMaxKeys; switch (config.getRangeChecking()) { case MIN_AND_MAX: final Class keyClass = 
entries.iterator().next().getFirst().getClass(); final boolean isComparable = Comparable.class.isAssignableFrom(keyClass); if (!isComparable) { throw new IllegalArgumentException("can't track ranges for non-comparable type: " + keyClass); } trackMinMaxKeys = true; break; case AUTOMATIC: trackMinMaxKeys = Comparable.class.isAssignableFrom(entries.iterator().next().getFirst().getClass()); break; default: trackMinMaxKeys = false; break; } mph = new GOVMinimalPerfectHashFunction.Builder<K>() .transform(transformationStrategy) .signed(config.getSignatureWidth()) .keys(trackMinMaxKeys ? new PairFirstRangeTrackingIterable(entries, minMaxKeys) : new PairFirstIterable(entries)) .build(); } catch (final IllegalArgumentException e) { if (e.getMessage() != null && e.getMessage().contains("duplicate") && config.getDebugDuplicateKeys()) { throw newDuplicateKeyException(entries, (SmartSerializer<K>) config.getKeySerializer(), e); } throw e; } LOGGER.info("dataSize: " + dataSize + " numEntries: " + mph.size()); writeWithMinimalPerfectHashFunction(null, outputDir, config, entries, mph, minMaxKeys, dataSize); } /** * As above, using a one-time iterator. The entries are written * to local temp data, making this suitable for use e.g. when * reading from a slow source such as hdfs. 
*
     * @param <K> key type
     * @param <V> value type
     * @param outputDir directory to write the hash table files to
     * @param config a {@link TableConfig} specifying at least a key serializer
     * @param entries an iterable of key-value Pairs representing entries in the table
     * @param tempDir directory to write temporary files to
     * @throws IOException if unable to write the files or serialize the data
     */
    public static <K, V> void writeWithTempStorage(
            final File outputDir,
            final TableConfig<K, V> config,
            final Iterator<Pair<K, V>> entries,
            final File tempDir) throws IOException {
        if (!config.isValid()) {
            throw new IOException("invalid table config: " + config);
        }
        ensureOutputDirectory(tempDir);
        // Spool the one-shot iterator to a local temp file so the multi-pass
        // write() below can iterate over the entries more than once.
        final File tempDataFile = File.createTempFile("tmp_entries", ".bin", tempDir);
        final BufferedFileDataOutputStream fileOut = new BufferedFileDataOutputStream(tempDataFile);
        long dataSize = 0;
        try (final LittleEndianDataOutputStream out = new LittleEndianDataOutputStream(fileOut)) {
            while (entries.hasNext()) {
                final Pair<K, V> e = entries.next();
                if (e.getFirst() == null || (e.getSecond() == null && config.getValueSerializer() != null)) {
                    throw new IllegalArgumentException("can't store nulls: " + e);
                }
                dataSize += config.sizeOf(e.getFirst(), e.getSecond());
                // The temp file always contains the keys, even when the final
                // table will use implicit key storage.
                config.getKeySerializer().write(e.getFirst(), out); // write here even if implicit
                if (config.getValueSerializer() != null) {
                    config.getValueSerializer().write(e.getSecond(), out);
                }
            }
        }
        try {
            final Iterable<Pair<K, V>> tempEntries =
                new SerializedKeyValueIterable(tempDataFile, config.getKeySerializer(), config.getValueSerializer());
            write(outputDir, config, tempEntries, dataSize);
        } finally {
            tempDataFile.delete();
        }
    }

    /**
     * As above, using the outputDir for temp storage.
* * @param <K> key type * @param <V> value type * @param outputDir directory to write the hash table files to * @param config a {@link TableConfig} specifying at least a key serializer * @param entries an iterable of key-value Pairs representing entries in the table * @throws IOException if unable to write the files or serialize the data */ public static <K, V> void writeWithTempStorage(final File outputDir, final TableConfig<K, V> config, final Iterator<Pair<K, V>> entries) throws IOException { writeWithTempStorage(outputDir, config, entries, outputDir); } public static void ensureOutputDirectory(final File outputDir) throws IOException { if (outputDir.exists()) { if (!outputDir.isDirectory()) { throw new IOException("can't overwrite regular file with directory: " + outputDir); } } else { outputDir.mkdirs(); if (!outputDir.exists()) { throw new IOException("couldn't create directory: " + outputDir); } } } private static <K, V> void writeWithMinimalPerfectHashFunction( final File inputData, final File outputDir, final TableConfig origConfig, final Iterable<Pair<K, V>> entries, final GOVMinimalPerfectHashFunction<K> mph, final List<K> minMaxKeys, final long dataSize) throws IOException { final TableConfig config = TableConfig.OffsetStorage.AUTOMATIC.equals(origConfig.getOffsetStorage()) ? 
origConfig.withOffsetStorage(origConfig.chooseBestOffsetStorage(mph.size(), dataSize)) : origConfig; final byte[] minKey = maybeSerializeKey(config, minMaxKeys.get(0)); final byte[] maxKey = maybeSerializeKey(config, minMaxKeys.get(1)); final TableMeta<K, V> meta; switch (config.getOffsetStorage()) { case FIXED: LOGGER.info("writing with fixed offset storage: " + config); meta = new TableMeta(config, mph, null, minKey, maxKey, dataSize); writeToHashOffsets(outputDir, meta, entries, dataSize); break; case INDEXED: LOGGER.info("writing with indexed offset storage: " + config); meta = new TableMeta(config, mph, null, minKey, maxKey, dataSize); writeToIndexedOffsets(inputData, new File(outputDir, meta.DEFAULT_DATA_PATH), new File(outputDir, meta.DEFAULT_OFFSETS_PATH), meta, entries, dataSize); break; case SELECTED: LOGGER.info("writing with selected offset storage: " + config); final File sizes = writeToHashOffsets(outputDir, new TableMeta(config, mph, null, dataSize), entries, dataSize); final Select select = sizesToSelect(config, sizes, dataSize); sizes.delete(); if (select.bitVector() instanceof LongArrayBitVector && (config.getMaxHeapUsage() > 0 && select.numBits() / 8L > config.getMaxHeapUsage())) { meta = new TableMeta(config, mph, null, minKey, maxKey, dataSize); writeLongs(new File(outputDir, meta.DEFAULT_OFFSETS_PATH), select.bitVector().bits()); } else { meta = new TableMeta(config, mph, select, minKey, maxKey, dataSize); } break; default: throw new IllegalArgumentException("unknown offset storage: " + config.getOffsetStorage()); } meta.store(new File(outputDir, meta.DEFAULT_META_PATH)); } private static <K, V> byte[] maybeSerializeKey(final TableConfig<K, V> config, final K key) { if (key != null) { return serializeToBytes(config.getKeySerializer(), key).array(); } return null; } private static <K, V> File writeToHashOffsets( final File outputDir, final TableMeta<K, V> meta, final Iterable<Pair<K, V>> entries, final long dataSize) throws IOException { // 
integer serialized size of each entry by hash final File tempSizes = File.createTempFile("tmpsizes", ".bin"); // integer hash (offset from start of shard) of each entry by output order final File tempHashes = File.createTempFile("tmphashes", ".bin"); try (final MMapBuffer sizes = new MMapBuffer(tempSizes, 0L, 4L * meta.numEntries(), FileChannel.MapMode.READ_WRITE, ByteOrder.nativeOrder()); final MMapBuffer hashes = new MMapBuffer(tempHashes, 0L, 4L * meta.numEntries(), FileChannel.MapMode.READ_WRITE, ByteOrder.nativeOrder())) { final List<File> shards = splitToShards(outputDir, meta, entries, dataSize, sizes, hashes); rewriteShardsInOrder(new File(outputDir, meta.DEFAULT_DATA_PATH), meta, shards, sizes, hashes); } finally { tempHashes.delete(); } return tempSizes; } private static <K, V> List<File> splitToShards( final File outputDir, final TableMeta<K, V> meta, final Iterable<Pair<K, V>> entries, final long dataSize, final MMapBuffer sizes, final MMapBuffer hashes) throws IOException { final long requestedShardSize = meta.getConfig().getTempShardSize(); final int baseNumShards = Math.min(MAX_TEMP_SHARDS, (int) (1 + (dataSize / requestedShardSize))); final long shardSize = Math.max(1L, (meta.numEntries() + baseNumShards - 1) / baseNumShards); final int numShards = Math.max(1, (int) ((meta.numEntries() + shardSize - 1) / shardSize)); LOGGER.info("splitting " + dataSize + " bytes to " + numShards + " temp shards of " + shardSize + " entries each"); final List<File> shards = new ArrayList<>(numShards); final List<CountingOutputStream> counters = new ArrayList<>(numShards); final List<DataOutput> outs = new ArrayList<>(numShards); final List<Integer> counts = new ArrayList<>(numShards); final long startMillis = System.currentTimeMillis(); try { for (int i = 0; i < numShards; ++i) { final File shard = File.createTempFile("tmpshard_" + i, ".bin", outputDir); shards.add(shard); final CountingOutputStream counter = new CountingOutputStream(new 
BufferedFileDataOutputStream(shard));
                counters.add(counter);
                // Little-endian view over the counting stream; the count gives
                // each entry's byte offset within its shard.
                outs.add(new LittleEndianDataOutputStream(new DataOutputStream(counter)));
                counts.add(0);
            }
            final DirectMemory sizesMemory = sizes.memory();
            final DirectMemory hashesMemory = hashes.memory();
            for (final Pair<K, V> e : entries) {
                final K key = e.getFirst();
                final V value = e.getSecond();
                final long hash = meta.getHash(key);
                // Shards cover contiguous hash ranges, so the shard index is a
                // division by range size, not a modulo.
                final int shard = (int) (hash / shardSize); // not modulo
                final DataOutput out = outs.get(shard);
                final CountingOutputStream counter = counters.get(shard);
                final long offset = counter.getCount();
                final int count = counts.get(shard);
                meta.getConfig().write(key, value, out);
                final int size = (int) (counter.getCount() - offset);
                // sizes is indexed by absolute hash; hashes is indexed by write
                // order within the shard's slot range, and stores the hash
                // relative to the shard's first slot.
                sizesMemory.putInt(hash * 4, size);
                hashesMemory.putInt(((shard * shardSize) + count) * 4, (int) (hash - shard * shardSize));
                counts.set(shard, count + 1);
            }
        } finally {
            for (final OutputStream out : counters) {
                if (out != null) {
                    out.close();
                }
            }
        }
        LOGGER.info("split " + numShards + " shards in " + (System.currentTimeMillis() - startMillis) + " ms");
        return shards;
    }

    // Concatenates the shard files into the final data file, rewriting each
    // shard's entries into hash order and deleting the shards as it goes.
    private static <K, V> void rewriteShardsInOrder(
            final File outputPath,
            final TableMeta<K, V> meta,
            final List<File> shards,
            final MMapBuffer sizes,
            final MMapBuffer hashes) throws IOException {
        final long startMillis = System.currentTimeMillis();
        try (final DataOutputStream out = new DataOutputStream(new BufferedFileDataOutputStream(outputPath))) {
            final int numShards = shards.size();
            // NOTE(review): recomputed from numShards; assumes this rounding
            // agrees with the shardSize computed in splitToShards -- confirm.
            final long shardSize = Math.max(1L, (meta.numEntries() + numShards - 1) / numShards);
            for (int i = 0; i < numShards; ++i) {
                final long start = i * shardSize;
                final long end = Math.min((i + 1) * shardSize, meta.numEntries());
                try {
                    rewriteShardInOrder(out, meta, shards.get(i), shardSize, sizes, hashes, start, end);
                } finally {
                    shards.get(i).delete();
                }
            }
            out.flush();
        }
        outputPath.setReadOnly();
        LOGGER.info("rewrote shards in " + (System.currentTimeMillis() - startMillis) + " ms");
    }

    // Copies one shard's entries to out ordered by hash, using the recorded
    // per-entry sizes and write-order hashes to locate each entry in the shard.
    private static <K, V> void rewriteShardInOrder(
            final DataOutputStream out,
            final TableMeta<K, V> meta,
            final File shard,
            final long shardSize,
            final MMapBuffer sizes, // by hash
            final MMapBuffer hashes, // by output order in shard
            final long start,
            final long end) throws IOException {
        // compute offsets by hash
        final DirectMemory sizesMemory = sizes.memory();
        final DirectMemory hashesMemory = hashes.memory();
        final long[] offsets = new long[(int) (end - start)];
        long offset = 0;
        int maxSize = 0;
        // First pass: walk the entries in the order they were written to the
        // shard, accumulating each one's byte offset, indexed by relative hash.
        for (long i = start; i < end; ++i) {
            final int hash = hashesMemory.getInt(i * 4);
            offsets[hash] = offset;
            final int size = sizesMemory.getInt((hash + start) * 4);
            offset += size;
            if (size > maxSize) {
                maxSize = size;
            }
        }
        final byte[] tmpBuf = new byte[maxSize + 1];
        // Second pass: emit the entries in hash order by reading each recorded
        // offset from the mmapped shard file.
        try (final MMapBuffer inbuf = new MMapBuffer(shard, 0L, shard.length(), FileChannel.MapMode.READ_ONLY, ByteOrder.nativeOrder())) {
            final DirectMemory memory = inbuf.memory();
            for (long i = start; i < end; ++i) {
                final long sourceOffset = offsets[(int) (i - start)];
                final int sourceSize = sizesMemory.getInt(i * 4);
                memory.getBytes(sourceOffset, tmpBuf, 0, sourceSize);
                out.write(tmpBuf, 0, sourceSize);
            }
        }
    }

    // Writes the data file in entry-iteration order while recording each
    // entry's byte offset (indexed by hash) in a separate mmapped offsets file.
    private static <K, V> void writeToIndexedOffsets(
            final File inputData,
            final File outputData,
            final File outputOffsets,
            final TableMeta<K, V> meta,
            final Iterable<Pair<K, V>> entries,
            final long dataSize) throws IOException {
        final long numEntries = meta.numEntries();
        final int offsetSize = meta.getConfig().bytesPerOffset(numEntries, dataSize);
        final long totalOffsetSize = numEntries * offsetSize;
        final BufferedFileDataOutputStream fileOut = new BufferedFileDataOutputStream(outputData);
        final CountingOutputStream countOut = new CountingOutputStream(fileOut);
        final long startMillis = System.currentTimeMillis();
        try (final MMapBuffer offsets = new MMapBuffer(outputOffsets, 0L, totalOffsetSize, FileChannel.MapMode.READ_WRITE, ByteOrder.nativeOrder());
             final LittleEndianDataOutputStream out = new LittleEndianDataOutputStream(countOut)) {
for (final Pair<K, V> e : entries) {
                final long hash = meta.getHash(e.getFirst());
                if (hash < 0) {
                    throw new IOException("inconsistent mph, known key hashed to -1: " + e.getFirst());
                }
                // The entry's offset is the number of data bytes written so far.
                final long offset = countOut.getCount();
                if (offsetSize == 2) {
                    offsets.memory().putShort(hash * 2L, (short) offset);
                } else if (offsetSize == 4) {
                    offsets.memory().putInt(hash * 4L, (int) offset);
                } else {
                    offsets.memory().putLong(hash * 8L, offset);
                }
                meta.getConfig().write(e.getFirst(), e.getSecond(), out);
            }
            offsets.sync(0L, totalOffsetSize);
            out.flush();
        }
        outputData.setReadOnly();
        outputOffsets.setReadOnly();
        LOGGER.info("wrote " + numEntries + " offsets for " + dataSize + " bytes of data in " + (System.currentTimeMillis() - startMillis) + " ms");
    }

    // Converts the per-entry sizes file into a Select structure over the
    // compressed cumulative offsets, used for SELECTED offset storage.
    private static <K, V> Select sizesToSelect(final TableConfig<K, V> config, final File tempSizes, final long dataSize) throws IOException {
        // The sizes file holds one int per entry.
        final long numEntries = tempSizes.length() / 4;
        try (final MMapBuffer sizes = new MMapBuffer(tempSizes, 0L, numEntries * 4, FileChannel.MapMode.READ_ONLY, ByteOrder.nativeOrder())) {
            final DirectMemory sizesMemory = sizes.memory();
            final long maxValue = config.compressOffset(dataSize, numEntries);
            final BitVector bits = LongArrayBitVector.ofLength(maxValue);
            // Set one bit per entry at its compressed cumulative offset.
            for (long i = 0, offset = 0; i < numEntries; offset += sizesMemory.getInt(i * 4), ++i) {
                final long value = config.compressOffset(offset, i);
                bits.set(value);
            }
            return new HintedBsearchSelect(new Rank9(bits));
        }
    }

    // Dumps raw longs little-endian to outputFile (used to spill a too-large
    // select structure's backing bits to disk).
    private static void writeLongs(final File outputFile, final long[] values) throws IOException {
        try (final LittleEndianDataOutputStream out = new LittleEndianDataOutputStream(new BufferedFileDataOutputStream(outputFile))) {
            for (final long value : values) {
                out.writeLong(value);
            }
            out.flush();
        }
        outputFile.setReadOnly();
    }

    // Re-scans the entries to identify which keys collide, producing a more
    // informative exception than the one thrown by the mph builder.
    private static <K, V> RuntimeException newDuplicateKeyException(final Iterable<Pair<K, V>> entries,
                                                                    final SmartSerializer<K> serializer,
                                                                    final RuntimeException e) {
        // TODO: Consider a disk-based sort to detect dups.  As-is, this
        // requires all keys (and their serialized forms) to fit in memory.
        LOGGER.error("attempting to find duplicate keys", e);
        K key1 = null;
        K key2 = null;
        ByteBuffer dupBytes = null;
        {
            // First pass: find the first pair of keys with identical
            // serialized forms.
            final Map<ByteBuffer, K> serializedKeys = new HashMap<>();
            for (final Pair<K, V> entry : entries) {
                final K key = entry.getFirst();
                final ByteBuffer bytes = serializeToBytes(serializer, key);
                if (serializedKeys.containsKey(bytes)) {
                    key1 = serializedKeys.get(bytes);
                    key2 = key;
                    dupBytes = bytes;
                    LOGGER.error("found duplicate key: " + key1 + " == " + key2);
                    break;
                }
                serializedKeys.put(bytes, key);
            }
        }
        if (key1 != null) {
            // Second pass to get associated values.
            V value1 = null;
            for (final Pair<K, V> entry : entries) {
                final K key = entry.getFirst();
                final ByteBuffer bytes = serializeToBytes(serializer, key);
                if (bytes.equals(dupBytes)) {
                    if (value1 == null) {
                        value1 = entry.getSecond();
                    } else {
                        return new IllegalArgumentException("Found duplicate key: " + Arrays.toString(bytes.array()) + ": " + key1 + " (" + value1 + ") == " + key2 + " (" + entry.getSecond() + ")", e);
                    }
                }
            }
        }
        return new IllegalArgumentException("couldn't find duplicate keys", e);
    }

    // Serializes a value to a byte buffer, wrapping IOException as unchecked.
    private static <T> ByteBuffer serializeToBytes(final SmartSerializer<T> serializer, final T t) {
        final ByteArrayOutputStream byteOutput = new ByteArrayOutputStream();
        final DataOutputStream dataOutput = new DataOutputStream(byteOutput);
        try {
            serializer.write(t, dataOutput);
        } catch (final IOException e) {
            throw new RuntimeException("failed to serialize: " + t, e);
        }
        return ByteBuffer.wrap(byteOutput.toByteArray());
    }

    /**
     * Adapts a SmartSerializer into the TransformationStrategy the mph
     * builder uses to view keys as bit vectors.
     */
    public static class SerializerTransformationStrategy<K> implements TransformationStrategy<K> {
        private static final long serialVersionUID = 8186081021441487460L;
        final SmartSerializer<K> serializer;

        public SerializerTransformationStrategy(final SmartSerializer<K> serializer) {
            this.serializer = serializer;
        }

        // Stateless apart from the serializer, so sharing this instance is safe.
        @Override
        public TransformationStrategy<K> copy() {
            return this;
        }

        @Override
        public long length(final K
k) { return toBytes(k).length * 8; } @Override public long numBits() { return 0L; } @Override public BitVector toBitVector(final K k) { return new ByteArrayBitVector(toBytes(k)); } private byte[] toBytes(final K k) { final ByteArrayOutputStream byteOutput = new ByteArrayOutputStream(); final DataOutputStream dataOutput = new DataOutputStream(byteOutput); try { serializer.write(k, dataOutput); } catch (final IOException e) { throw new RuntimeException("failed to serialize: " + k, e); } final byte[] res = byteOutput.toByteArray(); return res; } private static class ByteArrayBitVector extends AbstractBitVector { final byte[] bytes; ByteArrayBitVector(final byte[] bytes) { this.bytes = bytes; } public boolean getBoolean(final int index) { return ((bytes[index / 8] >>> (index % 8)) & 1) == 1; } public boolean getBoolean(final long index) { return getBoolean((int) index); } public long length() { return size64(); } public long size64() { return bytes.length * 8; } } } public static class PairFirstIterable<K, V> implements Iterable<K> { private final Iterable<Pair<K, V>> iter; public PairFirstIterable(final Iterable<Pair<K, V>> iter) { this.iter = iter; } @Override public Iterator<K> iterator() { return new PairFirstIterator<>(iter.iterator()); } } public static class PairFirstIterator<K, V> implements Iterator<K> { private final Iterator<Pair<K, V>> iter; public PairFirstIterator(final Iterator<Pair<K, V>> iter) { this.iter = iter; } @Override public boolean hasNext() { return iter.hasNext(); } @Override public K next() { return iter.next().getFirst(); } @Override public void remove() { throw new UnsupportedOperationException(); } } public static class PairFirstRangeTrackingIterable<K extends Comparable<K>, V> implements Iterable<K> { private final Iterable<Pair<K, V>> iter; private final List<K> minMaxKeys; public PairFirstRangeTrackingIterable(final Iterable<Pair<K, V>> iter, final List<K> minMaxKeys) { this.iter = iter; this.minMaxKeys = minMaxKeys; } @Override public 
Iterator<K> iterator() {
            return new PairFirstRangeTrackingIterator<>(iter.iterator(), minMaxKeys);
        }
    }

    /**
     * As PairFirstIterator, updating minMaxKeys[0] (min) and minMaxKeys[1]
     * (max) as each key is consumed.
     */
    public static class PairFirstRangeTrackingIterator<K extends Comparable<K>, V> implements Iterator<K> {
        private final Iterator<Pair<K, V>> iter;
        private final List<K> minMaxKeys;

        public PairFirstRangeTrackingIterator(final Iterator<Pair<K, V>> iter, final List<K> minMaxKeys) {
            this.iter = iter;
            this.minMaxKeys = minMaxKeys;
        }

        @Override
        public boolean hasNext() {
            return iter.hasNext();
        }

        @Override
        public K next() {
            final K key = iter.next().getFirst();
            if (minMaxKeys.get(0) == null || key.compareTo(minMaxKeys.get(0)) < 0) {
                minMaxKeys.set(0, key);
            }
            if (minMaxKeys.get(1) == null || key.compareTo(minMaxKeys.get(1)) > 0) {
                minMaxKeys.set(1, key);
            }
            return key;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Iterable over entries previously spooled to a file by
     * writeWithTempStorage, deserializing each key-value pair on the fly.
     */
    public static class SerializedKeyValueIterable<K, V> implements Iterable<Pair<K, V>> {
        private final File file;
        private final SmartSerializer keySerializer;
        private final SmartSerializer valueSerializer;

        public SerializedKeyValueIterable(final File file, final SmartSerializer keySerializer, final SmartSerializer valueSerializer) {
            this.file = file;
            this.keySerializer = keySerializer;
            this.valueSerializer = valueSerializer;
        }

        @Override
        public Iterator<Pair<K, V>> iterator() {
            try {
                // NOTE(review): the stream is handed off to the iterator and is
                // never closed here -- presumably SerializedKeyValueIterator
                // closes it on exhaustion; confirm.
                final FileInputStream fileIn = new FileInputStream(file);
                final InputStream in = new BufferedInputStream(fileIn);
                return new SerializedKeyValueIterator<>(in, keySerializer, valueSerializer);
            } catch (final IOException e) {
                throw new IllegalArgumentException("can't iterate on file: " + file, e);
            }
        }
    }

    /**
     * Iterable over key-value pairs parsed from TSV (or other separated)
     * files, suitable as input to the write methods above.
     */
    public static class TsvFileReader<K, V> implements Iterable<Pair<K, V>> {
        private final List<File> files;
        private final Parseable<K> keyParser;
        private final Parseable<V> valueParser;
        private final String separator;
        private final String replace;
        private final String to;
        // Fraction of malformed lines tolerated before iteration fails.
        private final double maxErrorRatio;

        public TsvFileReader(final List<File> files,
                             final Parseable<K> keyParser,
                             final Parseable<V> valueParser,
                             final String separator,
                             final String replace,
                             final String to,
                             final double maxErrorRatio) {
            this.files = files;
            this.keyParser = keyParser;
            this.valueParser = valueParser;
            this.separator = separator;
            this.replace = replace;
            this.to = to;
            this.maxErrorRatio = maxErrorRatio;
        }

        public TsvFileReader(final List<File> files, final Parseable<K> keyParser, final Parseable<V> valueParser, final String separator, final String replace, final String to, final boolean ignoreErrors) {
            this(files, keyParser, valueParser, separator, replace, to, ignoreErrors ? 1.0 : 0.0);
        }

        public TsvFileReader(final File file, final Parseable<K> keyParser, final Parseable<V> valueParser, final String separator, final String replace, final String to, final boolean ignoreErrors) {
            this(Collections.singletonList(file), keyParser, valueParser, separator, replace, to, ignoreErrors);
        }

        public TsvFileReader(final File file, final Parseable<K> keyParser, final Parseable<V> valueParser, final String separator, final String replace, final String to, final double maxErrorRatio) {
            this(Collections.singletonList(file), keyParser, valueParser, separator, replace, to, maxErrorRatio);
        }

        public TsvFileReader(final File file, final Parseable<K> keyParser, final Parseable<V> valueParser, final String separator) {
            this(file, keyParser, valueParser, separator, null, null, false);
        }

        @Override
        public Iterator<Pair<K, V>> iterator() {
            try {
                return new ParseableInputKeyValueIterator(makeSequenceFileInputStream(files), keyParser, valueParser, separator, replace, to, maxErrorRatio);
            } catch (final IOException e) {
                throw new IllegalArgumentException("can't iterate on file: " + files, e);
            }
        }

        // Chains the files into a single InputStream, in list order.
        private InputStream makeSequenceFileInputStream(final List<File> files) throws IOException {
            InputStream result = new FileInputStream(files.get(0));
            for (int i = 1; i < files.size(); ++i) {
                result = new SequenceInputStream(result, new FileInputStream(files.get(i)));
            }
            return
result;
        }
    }

    // Parses one argument in a serializer expression: quoted string, boolean,
    // 'L'-suffixed long, 'f'-suffixed float, int, or (recursively) a nested
    // serializer object expression.
    static Object parseSerializerArg(final String str) throws IOException {
        if (str.startsWith("\"")) {
            return str.substring(1, str.length() - 1); // TODO: handle escapes
        } else if ("true".equalsIgnoreCase(str)) {
            return true;
        } else if ("false".equalsIgnoreCase(str)) {
            return false;
        }
        try {
            if ('L' == str.charAt(str.length() - 1)) {
                return Long.parseLong(str.substring(0, str.length() - 1));
            } else if ('f' == str.charAt(str.length() - 1)) {
                return Float.parseFloat(str.substring(0, str.length() - 1));
            } else {
                return Integer.parseInt(str);
            }
        } catch (final Exception e) {
            // Not a number: treat it as a class expression.
            return parseSerializerObject(str);
        }
    }

    // Instantiates an object from an expression of the form Class(args...),
    // resolving a leading "." against the com.indeed.mph.serializers package.
    static Object parseSerializerObject(final String name) throws IOException {
        if ("null".equalsIgnoreCase(name)) {
            return null;
        }
        final List<Object> args = new ArrayList<>();
        final int paren = name.indexOf('(');
        if (paren >= 0) {
            // Split the top-level comma-separated args, tracking paren nesting
            // depth and skipping over quoted strings (with backslash escapes).
            int depth = 0;
            int from = paren + 1;
            for (int i = from; i < name.length(); ++i) {
                switch (name.charAt(i)) {
                    case '(':
                        ++depth;
                        break;
                    case ')':
                        if (depth == 0) {
                            // Closing paren of the outer call: take the final
                            // arg and stop scanning.
                            args.add(parseSerializerArg(name.substring(from, i).trim()));
                            i = name.length();
                        } else {
                            --depth;
                        }
                        break;
                    case ',':
                        if (depth == 0) {
                            final String arg = name.substring(from, i).trim();
                            if (!arg.isEmpty()) {
                                args.add(parseSerializerArg(arg));
                            }
                            from = i + 1;
                        }
                        break;
                    case '"':
                        for (++i; i < name.length(); ++i) {
                            final char ch = name.charAt(i);
                            if (ch == '"') {
                                break;
                            } else if (ch == '\\') {
                                ++i;
                            }
                        }
                        break;
                }
            }
        }
        final String baseName = paren >= 0 ? name.substring(0, paren) : name;
        final String fullName = baseName.startsWith(".") ?
            "com.indeed.mph.serializers" + baseName : baseName;
        try {
            // Try each public constructor until one accepts the parsed args.
            for (final Constructor constructor : Class.forName(fullName).getConstructors()) {
                try {
                    return constructor.newInstance(args.toArray(new Object[args.size()]));
                } catch (final InstantiationException | IllegalAccessException | InvocationTargetException | IllegalArgumentException e) {
                    // try the next method
                }
            }
        } catch (final ClassNotFoundException e) {
            throw new IOException("unknown class: " + fullName);
        }
        throw new IOException("couldn't find a suitable SmartSerializer constructor: " + name);
    }

    // As parseSerializerObject, verifying the result is a SmartSerializer.
    public static SmartSerializer<Object> parseSerializer(final String name) throws IOException {
        final Object obj = parseSerializerObject(name);
        if (obj instanceof SmartSerializer) {
            return (SmartSerializer<Object>) obj;
        }
        throw new IOException("not a SmartSerializer: " + name + " -> " + obj);
    }

    /**
     * Command-line entry point: builds a table from TSV files.  See the
     * class javadoc for the available options.
     */
    public static void main(final String[] args) throws IOException {
        TableConfig<Object, Object> config = new TableConfig()
            .withKeySerializer(new SmartStringSerializer())
            .withValueSerializer(new SmartStringSerializer());
        Parseable<Object> keyParser = null;
        Parseable<Object> valueParser = null;
        String separator = "\t";
        String replace = null;
        String to = "";
        double maxErrorRatio = 0.0;
        boolean withTemp = false;
        int i = 0;
        parse_opts:
        for ( ; i < args.length && args[i].startsWith("-"); ++i) {
            switch (args[i]) {
                case "--":
                    break parse_opts;
                case "--keyParser":
                    keyParser = (Parseable<Object>) parseSerializerObject(args[++i]);
                    break;
                case "--valueParser":
                    valueParser = (Parseable<Object>) parseSerializerObject(args[++i]);
                    break;
                case "--keySerializer":
                    config = config.withKeySerializer(parseSerializer(args[++i]));
                    break;
                case "--valueSerializer":
                    config = config.withValueSerializer(parseSerializer(args[++i]));
                    break;
                case "--keyStorage":
                    config = config.withKeyStorage(TableConfig.KeyStorage.valueOf(args[++i]));
                    break;
                case "--offsetStorage":
                    config = config.withOffsetStorage(TableConfig.OffsetStorage.valueOf(args[++i]));
                    break;
                case
"--rangeChecking":
                    config = config.withRangeChecking(TableConfig.RangeChecking.valueOf(args[++i]));
                    break;
                case "--signatureWidth":
                    config = config.withSignatureWidth(Integer.parseInt(args[++i]));
                    break;
                case "--maxHeapUsage":
                    config = config.withMaxHeapUsage(Long.parseLong(args[++i]));
                    break;
                case "--separator":
                    separator = args[++i];
                    break;
                case "--replace":
                    replace = args[++i];
                    break;
                case "--to":
                    to = args[++i];
                    break;
                case "--ignoreErrors":
                    maxErrorRatio = 1.0;
                    break;
                case "--maxErrorRatio":
                    maxErrorRatio = Double.parseDouble(args[++i]);
                    break;
                case "--withTempStorage":
                    withTemp = true;
                    break;
                default:
                    throw new RuntimeException("unknown option: " + args[i]);
            }
        }
        // After the options we need at least an output dir and one input file.
        if (args.length - i < 2) {
            throw new RuntimeException("usage: TableWriter [options] <output_dir> <input.tsv> ...");
        }
        final File outputDir = new File(args[i]);
        final List<File> files = new ArrayList<>();
        for (int j = i + 1; j < args.length; ++j) {
            files.add(new File(args[j]));
        }
        // When no explicit parser is given, the serializer is used in its
        // place (via the raw TsvFileReader type) -- presumably the serializers
        // also implement Parseable; confirm.
        final Iterable<Pair<Object, Object>> reader =
            new TsvFileReader(files,
                              ((keyParser != null) ? keyParser : config.getKeySerializer()),
                              ((valueParser != null) ? valueParser : config.getValueSerializer()),
                              separator, replace, to, maxErrorRatio);
        if (withTemp) {
            writeWithTempStorage(outputDir, config, reader.iterator(), outputDir);
        } else {
            write(outputDir, config, reader);
        }
    }
}