package com.digitalpebble.stormcrawler.warc; import java.io.IOException; import java.time.Instant; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.fs.Path; import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy; import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units; import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy; import org.apache.storm.hdfs.common.AbstractHDFSWriter; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.tuple.Tuple; import org.apache.storm.utils.Utils; import com.digitalpebble.stormcrawler.protocol.ProtocolResponse; import com.digitalpebble.stormcrawler.util.ConfUtils; @SuppressWarnings("serial") public class WARCHdfsBolt extends GzipHdfsBolt { private Map<String, String> header_fields = new HashMap<>(); // by default remains as is-pre 1.17 private String protocolMDprefix = ""; private boolean withRequestRecords = false; public WARCHdfsBolt() { super(); FileSizeRotationPolicy rotpol = new FileSizeRotationPolicy(1.0f, Units.GB); withRotationPolicy(rotpol); // dummy sync policy withSyncPolicy(new CountSyncPolicy(10)); // default local filesystem withFsUrl("file:///"); } public WARCHdfsBolt withHeader(Map<String, String> header_fields) { this.header_fields = header_fields; return this; } public WARCHdfsBolt withRequestRecords() { withRequestRecords = true; return this; } @Override public void doPrepare(Map conf, TopologyContext topologyContext, OutputCollector collector) throws IOException { super.doPrepare(conf, topologyContext, collector); protocolMDprefix = ConfUtils.getString(conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, ""); withRecordFormat(new WARCRecordFormat(protocolMDprefix)); if (withRequestRecords) { addRecordFormat(new WARCRequestRecordFormat(protocolMDprefix), 0); } } @Override protected AbstractHDFSWriter makeNewWriter(Path path, Tuple tuple) throws IOException { AbstractHDFSWriter writer = super.makeNewWriter(path, tuple); Instant now = Instant.now(); // overrides the filename and creation date in the headers header_fields.put("WARC-Date", WARCRecordFormat.WARC_DF.format(now)); header_fields.put("WARC-Filename", path.getName()); byte[] header = WARCRecordFormat.generateWARCInfo(header_fields); // write the header at the beginning of the file if (header != null && header.length > 0) { super.out.write(Utils.gzip(header)); } return writer; } }