package outbackcdx; import org.apache.commons.codec.binary.Base32; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.time.ZoneOffset; import java.time.format.DateTimeFormatter; import java.util.Date; import java.util.logging.Level; import java.util.logging.Logger; import java.util.Map; import static java.nio.charset.StandardCharsets.US_ASCII; import static java.nio.charset.StandardCharsets.UTF_8; /** * A CDX record which can be encoded to a reasonable space-efficient packed representation. * <p> * Records are encoded as two byte arrays called the key and the value. The record's key is designed to be bytewise * sorted and is simply the urlkey concatenated with the timestamp as a big-endian 64-bit value. * * <pre> * 0 urlkey.length urlkey.size + 8 * +--------------+-----------------------------+ * | ASCII urlkey | 64-bit big-endian timestamp | * +--------------+-----------------------------+ * </pre> * <p>Version 4 keys are extended to include the WARC filename and record offset and two NUL bytes. The first NUL used * to determine the length of filename (searching backwards from the end). The second is a flag indicating this is the * new key version. * <pre> * +---------+------------------+-----|----------+-----+---------------+ * | urlkey | 64-bit timestamp | NUL | filename | NUL | 64-bit offset | * +---------+------------------+-----|----------+-----+---------------+ * </pre> * <p> * The record's value consists of a static list fields packed using {@link outbackcdx.VarInt}. The first field in the * value is a schema version number to allow fields to be added or removed in later versions. */ public class Capture { private static final Logger log = Logger.getLogger(Capture.class.getName()); static final DateTimeFormatter arcTimeFormat = DateTimeFormatter.ofPattern("yyyyMMddHHmmss"); static final Base32 base32 = new Base32(); public String urlkey; public long timestamp; public String original; public String mimetype; public int status; public String digest; public long length; public String file; public long compressedoffset; public String redirecturl; public String robotflags; // Additional properties for CDX14 public long originalLength = -1; public long originalCompressedoffset = -1; public String originalFile = "-"; public Capture(Map.Entry<byte[], byte[]> entry) { this(entry.getKey(), entry.getValue()); } public Capture(byte[] key, byte[] value) { decodeKey(key); decodeValue(ByteBuffer.wrap(value)); } public Capture() { } public void decodeKey(byte[] key) { if (key.length > 8 && key[key.length - 9] == 0) { decodeKeyV4(key); } else { decodeKeyV0(key); } } private void decodeKeyV0(byte[] key) { urlkey = new String(key, 0, key.length - 8, US_ASCII); ByteBuffer keyBuf = ByteBuffer.wrap(key); timestamp = keyBuf.getLong(key.length - 8); } @SuppressWarnings("StatementWithEmptyBody") private void decodeKeyV4(byte[] key) { int i; for (i = key.length - 10; i >= 0 && key[i] != 0; i--); if (i <= 8) throw new IllegalArgumentException("bad key"); ByteBuffer keyBuf = ByteBuffer.wrap(key); urlkey = new String(key, 0, i - 8, US_ASCII); timestamp = keyBuf.getLong(i - 8); file = new String(key, i + 1, key.length - i - 10); compressedoffset = keyBuf.getLong(key.length - 8); } public static byte[] encodeKeyV0(String keyurl, long timestamp) { byte[] urlBytes = keyurl.getBytes(US_ASCII); ByteBuffer bb = ByteBuffer.allocate(urlBytes.length + 8); bb.order(ByteOrder.BIG_ENDIAN); bb.put(urlBytes); bb.putLong(timestamp); return bb.array(); } public static byte[] encodeKeyV4(String keyurl, long timestamp, String file, long offset) { byte[] urlBytes = keyurl.getBytes(US_ASCII); byte[] fileBytes = file.getBytes(UTF_8); ByteBuffer bb = ByteBuffer.allocate(urlBytes.length + 8 + 1 + fileBytes.length + 1 + 8); bb.order(ByteOrder.BIG_ENDIAN); bb.put(urlBytes); bb.putLong(timestamp); bb.put((byte)0); bb.put(fileBytes); bb.put((byte)0); bb.putLong(offset); return bb.array(); } public byte[] encodeKey() { return encodeKey(FeatureFlags.indexVersion()); } public byte[] encodeKey(int version) { switch (version) { case 0: case 1: case 2: case 3: return encodeKeyV0(urlkey, timestamp); case 4: return encodeKeyV4(urlkey, timestamp, file, compressedoffset); default: throw new IllegalArgumentException("unsupported version: " + 4); } } public void decodeValue(ByteBuffer bb) { int version = (int) VarInt.decode(bb); switch (version) { case 0: decodeValueV0(bb); break; case 1: decodeValueV1(bb); break; case 2: decodeValueV2(bb); break; case 3: decodeValueV3(bb); break; case 4: decodeValueV4(bb); break; default: throw new IllegalArgumentException("CDX encoding is too new (v" + version + ") only versions up to v4 are supported"); } } private void decodeValueV0(ByteBuffer bb) { original = VarInt.decodeAscii(bb); status = (int) VarInt.decode(bb); mimetype = VarInt.decodeAscii(bb); length = VarInt.decode(bb); digest = VarInt.decodeAscii(bb); file = VarInt.decodeAscii(bb); compressedoffset = VarInt.decode(bb); redirecturl = VarInt.decodeAscii(bb); robotflags = "-"; } private void decodeValueV1(ByteBuffer bb) { original = VarInt.decodeAscii(bb); status = (int) VarInt.decode(bb); mimetype = VarInt.decodeAscii(bb); length = VarInt.decode(bb); digest = base32.encodeAsString(VarInt.decodeBytes(bb)); file = VarInt.decodeAscii(bb); compressedoffset = VarInt.decode(bb); redirecturl = VarInt.decodeAscii(bb); robotflags = "-"; } private void decodeValueV2(ByteBuffer bb) { decodeValueV1(bb); robotflags = VarInt.decodeAscii(bb); } private void decodeValueV3(ByteBuffer bb) { decodeValueV2(bb); originalLength = VarInt.decode(bb); originalFile = VarInt.decodeAscii(bb); originalCompressedoffset = VarInt.decode(bb); } private void decodeValueV4(ByteBuffer bb) { original = VarInt.decodeAscii(bb); status = (int) VarInt.decode(bb); mimetype = VarInt.decodeAscii(bb); length = VarInt.decode(bb); digest = base32.encodeAsString(VarInt.decodeBytes(bb)); redirecturl = VarInt.decodeAscii(bb); robotflags = VarInt.decodeAscii(bb); originalLength = VarInt.decode(bb); originalFile = VarInt.decodeAscii(bb); originalCompressedoffset = VarInt.decode(bb); } public int sizeValue() { return sizeValue(FeatureFlags.indexVersion()); } public int sizeValue(int version) { switch (version) { case 3: return sizeValueV3(); case 4: return sizeValueV4(); default: throw new IllegalArgumentException("Unsupported version " + version); } } private int sizeValueV3() { return VarInt.size(3) + VarInt.sizeAscii(original) + VarInt.size(status) + VarInt.sizeAscii(mimetype) + VarInt.size(length) + VarInt.sizeBytes(base32.decode(digest)) + VarInt.sizeAscii(file) + VarInt.size(compressedoffset) + VarInt.sizeAscii(redirecturl) + VarInt.sizeAscii(robotflags) + VarInt.size(originalLength) + VarInt.sizeAscii(originalFile) + VarInt.size(originalCompressedoffset); } private int sizeValueV4() { return VarInt.size(4) + VarInt.sizeAscii(original) + VarInt.size(status) + VarInt.sizeAscii(mimetype) + VarInt.size(length) + VarInt.sizeBytes(base32.decode(digest)) + VarInt.sizeAscii(redirecturl) + VarInt.sizeAscii(robotflags) + VarInt.size(originalLength) + VarInt.sizeAscii(originalFile) + VarInt.size(originalCompressedoffset); } public void encodeValue(ByteBuffer bb) { encodeValue(bb, FeatureFlags.indexVersion()); } private void encodeValue(ByteBuffer bb, int version) { switch (version) { case 3: encodeValueV3(bb); break; case 4: encodeValueV4(bb); break; default: throw new IllegalArgumentException("Unsupported version " + version); } } private void encodeValueV3(ByteBuffer bb) { VarInt.encode(bb, 3); VarInt.encodeAscii(bb, original); VarInt.encode(bb, status); VarInt.encodeAscii(bb, mimetype); VarInt.encode(bb, length); VarInt.encodeBytes(bb, base32.decode(digest)); VarInt.encodeAscii(bb, file); VarInt.encode(bb, compressedoffset); VarInt.encodeAscii(bb, redirecturl); VarInt.encodeAscii(bb, robotflags); VarInt.encode(bb, originalLength); VarInt.encodeAscii(bb, originalFile); VarInt.encode(bb, originalCompressedoffset); } private void encodeValueV4(ByteBuffer bb) { VarInt.encode(bb, 4); VarInt.encodeAscii(bb, original); VarInt.encode(bb, status); VarInt.encodeAscii(bb, mimetype); VarInt.encode(bb, length); VarInt.encodeBytes(bb, base32.decode(digest)); VarInt.encodeAscii(bb, redirecturl); VarInt.encodeAscii(bb, robotflags); VarInt.encode(bb, originalLength); VarInt.encodeAscii(bb, originalFile); VarInt.encode(bb, originalCompressedoffset); } public byte[] encodeValue(int version) { ByteBuffer bb = ByteBuffer.allocate(sizeValue(version)); encodeValue(bb, version); return bb.array(); } public byte[] encodeValue() { return encodeValue(FeatureFlags.indexVersion()); } /** * Format as a CDX11 line, or CDX14, depending on what fields are present. */ public String toString() { StringBuilder out = new StringBuilder(); out.append(urlkey).append(" "); out.append(Long.toString(timestamp)).append(" "); out.append(original).append(" "); out.append(mimetype).append(" "); out.append(Integer.toString(status)).append(" "); out.append(digest).append(" "); out.append(redirecturl).append(" "); out.append(robotflags).append(" "); out.append(Long.toString(length)).append(" "); out.append(compressedoffset).append(" "); out.append(file); if (FeatureFlags.indexVersion() >= 3) { out.append(" "); if (originalLength > 0) { out.append(Long.toString(originalLength)).append(" "); } else { out.append("-").append(" "); } if (originalCompressedoffset > 0) { out.append(Long.toString(originalCompressedoffset)).append(" "); } else { out.append("-").append(" "); } out.append(originalFile); } return out.toString(); } public static Capture fromCdxLine(String line, UrlCanonicalizer canonicalizer) { String[] fields = line.split(" "); Capture capture = new Capture(); capture.timestamp = Long.parseLong(fields[1]); capture.original = fields[2]; capture.urlkey = canonicalizer.surtCanonicalize(capture.original); capture.mimetype = fields[3]; capture.status = fields[4].equals("-") ? 0 : Integer.parseInt(fields[4]); capture.digest = fields[5]; capture.redirecturl = fields[6]; if (fields.length >= 11) { // 11 fields: CDX N b a m s k r M S V g capture.robotflags = fields[7]; capture.length = fields[8].equals("-") ? 0 : Long.parseLong(fields[8]); capture.compressedoffset = Long.parseLong(fields[9]); capture.file = fields[10]; if (fields.length == 14 ) { // 14 fields: CDX N b a m s k r M S V g capture.originalLength = fields[11].equals("-") ? 0 : Long.parseLong(fields[11]); capture.originalCompressedoffset = fields[12].equals("-") ? 0 : Long.parseLong(fields[12]); capture.originalFile = fields[13]; } } else if (fields.length == 10) { // 10 fields: CDX N b a m s k r M V g capture.robotflags = fields[7]; capture.compressedoffset = Long.parseLong(fields[8]); capture.file = fields[9]; } else { // 9 fields: CDX N b a m s k r V g capture.robotflags = "-"; capture.compressedoffset = Long.parseLong(fields[7]); capture.file = fields[8]; } return capture; } public Date date() { return parseTimestamp(timestamp); } static final String PAD_TIMESTAMP = "00000000000000"; // we expect 14 chars public static Date parseTimestamp(long timestamp) { String timestampstr = Long.toString(timestamp); if (timestampstr.length() < 14) { log.log(Level.WARNING, "Padding timestamp shorter then 14 chars: " + timestampstr); timestampstr = timestampstr + PAD_TIMESTAMP.substring(timestampstr.length()); } return Date.from(LocalDateTime.parse(timestampstr, arcTimeFormat).toInstant(ZoneOffset.UTC)); } /** * Gets the value of a field by name. We support several names as pywb and wayback-cdx-server use different names. * * @throws IllegalArgumentException for unknown fields */ public Object get(String field) { switch (field) { case "urlkey": return urlkey; case "timestamp": return timestamp; case "url": case "original": return original; case "mime": case "mimetype": return mimetype; case "statuscode": case "status": return status; case "digest": return digest; case "redirecturl": case "redirect": return redirecturl; case "robotflags": return robotflags; case "length": return length; case "offset": return compressedoffset; case "filename": return file; case "originalLength": return (originalLength != -1) ? originalLength : "-"; case "originalOffset": return (originalCompressedoffset != -1) ? originalCompressedoffset : "-"; case "originalFilename": return originalFile; case "range": return "bytes=" + compressedoffset + "-" + (compressedoffset + length - 1); default: throw new IllegalArgumentException("no such capture field: " + field); } } }