org.archive.io.ArchiveReader Java Examples

The following examples show how to use org.archive.io.ArchiveReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ARCReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Positions {@code testfile} at {@code offset}, opens it through the stream-based
 * {@code ARCReaderFactory.get(String, InputStream, boolean)} overload, and checks the
 * first record returned by the reader.
 *
 * @param testfile archive file to read
 * @param offset byte offset the file is sought to before the reader is created
 * @param uri URL the record read at {@code offset} is expected to report
 * @throws Exception if the file cannot be opened, sought or read
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    RandomAccessFile raf = new RandomAccessFile(testfile, "r");
    try {
        raf.seek(offset);
        // Share the already-positioned descriptor so the stream starts at 'offset'.
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        // Original note: the File/offset overload, ARCReaderFactory.get(testfile, offset),
        // is known to work; this test exercises the stream-based overload instead.
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        ArchiveRecord record = reader.get();

        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);

        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    } finally {
        // Clean up even when an assertion or a read above fails, so the file handle is not leaked.
        raf.close();
    }
}
 
Example #2
Source File: ARCReaderFactory.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a reader for {@code arcFile} starting at {@code offset}.
 *
 * A file reported as compressed by {@code testCompressedARCFile} gets a
 * {@code CompressedARCReader}. Any other file must pass the extension and
 * magic-number check before an {@code UncompressedARCReader} is created.
 *
 * @param arcFile file to open
 * @param skipSuffixTest forwarded unchanged to {@code testCompressedARCFile}
 * @param offset forwarded unchanged to the reader constructor
 * @return a reader over {@code arcFile}
 * @throws IOException if the file is neither compressed nor passes the ARC check
 */
protected ArchiveReader getArchiveReader(final File arcFile,
           final boolean skipSuffixTest, final long offset)
   throws IOException {
    final boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
    if (compressed) {
        return (ARCReader) ARCReaderFactory.factory.new CompressedARCReader(arcFile, offset);
    }

    // Only uncompressed candidates are validated by extension and magic number.
    final boolean plainArc = FileUtils.isReadableWithExtensionAndMagic(
            arcFile, ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER);
    if (!plainArc) {
        final String message = arcFile.getAbsolutePath()
                + " is not an Internet Archive ARC file.";
        throw new IOException(message);
    }
    return (ARCReader) ARCReaderFactory.factory.new UncompressedARCReader(arcFile, offset);
}
 
Example #3
Source File: ArcHarvestFileDTO.java    From webcurator with Apache License 2.0 5 votes vote down vote up
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * Opens the file named by {@code getName()} under {@code baseDir}, records whether the
 * reader reports it as compressed, and walks every record. WARC records are passed to
 * {@code indexWARCResponse} only when their type header equals
 * {@code WARCConstants.RESPONSE} and their mimetype is not {@code text/dns}; every
 * non-WARC record is passed to {@code indexARCRecord}.
 *
 * @param baseDir the base directory of the arcs
 * @return the map that was handed to the indexing helpers for every qualifying record
 * @throws IOException thrown if there is an error
 * @throws ParseException declared for the indexing helpers; NOTE(review): the exact
 *         source is not visible in this method — confirm
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
	Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();

	File theArchiveFile = new File(baseDir, this.getName());
	ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
	try {
		this.compressed = reader.isCompressed();

		Iterator<ArchiveRecord> it = reader.iterator();
		while(it.hasNext()) {
			ArchiveRecord rec = it.next();

			if(rec instanceof WARCRecord) {
				String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
				if(type.equals(WARCConstants.RESPONSE)) {
					String mime = rec.getHeader().getMimetype();
					// DNS lookups are response records too, but are left out of the index.
					if(!mime.equals("text/dns")) {
						indexWARCResponse(rec, results);
					}
				}
			}
			else {
				indexARCRecord(rec, results);
			}
		}
	} finally {
		// Close the reader even when a record fails to index, so the file is not leaked.
		reader.close();
	}

	return results;
}
 
Example #4
Source File: WARCReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that every path in {@code files} can be opened through
 * {@code WARCReaderFactory.get(String, InputStream, boolean)} and that the first
 * record's type header is {@code warcinfo}.
 *
 * @throws IOException if a test file cannot be opened or read
 */
public void testGetStringInputstreamBoolean() throws IOException {
	// Check the test files can be opened:
	for( String file : files ) {
		FileInputStream is = new FileInputStream(file);
		try {
			ArchiveReader ar = WARCReaderFactory.get(file, is, true);
			ArchiveRecord r = ar.get();
			String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
			// Check the first record comes out as a 'warcinfo' record.
			assertEquals(WARCRecordType.warcinfo.name(), type);
		} finally {
			// One stream is opened per file; release it whether or not the assertion passed.
			is.close();
		}
	}
}
 
Example #5
Source File: WARCReaderFactory.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the reader implementation for a stream from the name {@code f}.
 *
 * @param f name associated with the stream; only its suffix is inspected here
 * @param is stream handed to the chosen reader
 * @param atFirstRecord forwarded to the compressed reader only
 * @return a {@code CompressedWARCReader} when {@code f} ends with {@code .gz},
 *         otherwise an {@code UncompressedWARCReader}
 * @throws IOException declared by the signature; this body itself does not throw it
 *         directly
 */
protected ArchiveReader getArchiveReader(final String f,
		final InputStream is, final boolean atFirstRecord)
		throws IOException {
	// Compression is decided purely from the file extension.
	final boolean gzipped = f.endsWith(".gz");
	if (!gzipped) {
		return new UncompressedWARCReader(f, is);
	}
	return new CompressedWARCReader(f, is, atFirstRecord);
}
 
Example #6
Source File: WARCReaderFactory.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a reader for the file {@code f} starting at {@code offset}.
 *
 * A file reported as compressed by {@code testCompressedWARCFile} gets a
 * {@code CompressedWARCReader}. Any other file must pass the extension and
 * magic check before an {@code UncompressedWARCReader} is created.
 *
 * @param f file to open
 * @param offset forwarded unchanged to the reader constructor
 * @return a reader over {@code f}
 * @throws IOException if the file is neither compressed nor passes the WARC check
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
   throws IOException {
	if (testCompressedWARCFile(f)) {
		return (WARCReader) WARCReaderFactory.factory.new CompressedWARCReader(f, offset);
	}

	// Only uncompressed candidates are validated by extension and magic.
	final boolean plainWarc = FileUtils.isReadableWithExtensionAndMagic(
			f, DOT_WARC_FILE_EXTENSION, WARC_MAGIC);
	if (!plainWarc) {
		final String message = f.getAbsolutePath() + " is not a WARC file.";
		throw new IOException(message);
	}
	return (WARCReader) WARCReaderFactory.factory.new UncompressedWARCReader(f, offset);
}
 
Example #7
Source File: WordCounterMap.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Walks every record of {@code value} and, for records whose mimetype equals
 * {@code text/plain}, writes each whitespace-delimited token as a key paired with
 * {@code outVal}. Counters track accepted records, empty texts, skipped records and
 * records that raised an exception.
 *
 * @param key not read by this method
 * @param value reader whose records are iterated
 * @param context receives the counters and the emitted pairs
 * @throws IOException declared by the signature; exceptions raised while handling a
 *         single record are logged and counted instead of propagated
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	for (ArchiveRecord record : value) {
		try {
			final boolean plainText = record.getHeader().getMimetype().equals("text/plain");
			if (!plainText) {
				context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
				continue;
			}

			context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
			LOG.debug(record.getHeader().getUrl() + " -- " + record.available());

			// Pull the whole record body into memory and decode it as text.
			final byte[] bytes = IOUtils.toByteArray(record, record.available());
			final String text = new String(bytes);

			// Split the document into words.
			tokenizer = new StringTokenizer(text);
			if (!tokenizer.hasMoreTokens()) {
				context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
				continue;
			}
			while (tokenizer.hasMoreTokens()) {
				outKey.set(tokenizer.nextToken());
				context.write(outKey, outVal);
			}
		} catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}
 
Example #8
Source File: TagCounterMap.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Walks every record of {@code value} and, for HTTP response records whose header
 * text contains {@code Content-Type: text/html}, writes the lower-cased first capture
 * group of every {@code HTML_TAG_PATTERN} match in the body as a key paired with
 * {@code outVal}.
 *
 * @param key not read by this method
 * @param value reader whose records are iterated
 * @param context receives the counters and the emitted pairs
 * @throws IOException declared by the signature; exceptions raised while handling a
 *         single record are logged and counted instead of propagated
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	// Compile the regular expression once per call; it is reused for every record below
	patternTag = Pattern.compile(HTML_TAG_PATTERN);

	for (ArchiveRecord r : value) {
		try {
			LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
			// We're only interested in processing the responses, not requests or metadata
			if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) {
				// Convenience function that reads the full message into a raw byte array
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				String content = new String(rawData);
				// The blank line separates the HTTP header from the body. When it is missing
				// the index is -1 and substring throws, which the catch below counts as an
				// exception for this record.
				int headerEnd = content.indexOf("\r\n\r\n");
				String headerText = content.substring(0, headerEnd);

				// In our task, we're only interested in text/html, so we can be a little lax
				// TODO: Proper HTTP header parsing + don't trust headers
				if (headerText.contains("Content-Type: text/html")) {
					context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
					// Only extract the body of the HTTP response when necessary
					String body = content.substring(headerEnd + 4);
					// Process all the matched HTML tags found in the body of the document
					matcherTag = patternTag.matcher(body);
					while (matcherTag.find()) {
						String tagName = matcherTag.group(1);
						// Locale.ROOT keeps keys stable: under e.g. a Turkish default locale
						// "TITLE".toLowerCase() would not produce "title".
						outKey.set(tagName.toLowerCase(java.util.Locale.ROOT));
						context.write(outKey, outVal);
					}
				}
			}
		}
		catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}
 
Example #9
Source File: WARCReaderTest.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Opens a local compressed WARC file and prints, for the first six records, the
 * record header, its URL and up to the first 500 characters of the record content.
 *
 * @param args not used
 * @throws IOException if the file cannot be opened or a record cannot be read
 */
public static void main(String[] args) throws IOException {
	// Set up a local compressed WARC file for reading 
	String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	FileInputStream is = new FileInputStream(fn);
	try {
		// The file name identifies the ArchiveReader and indicates if it should be decompressed
		ArchiveReader ar = WARCReaderFactory.get(fn, is, true);

		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println(r.getHeader());
			System.out.println(r.getHeader().getUrl());
			System.out.println();

			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as the record's stated length
			byte[] rawData = IOUtils.toByteArray(r, r.available());

			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			String content = new String(rawData);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));

			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Post-increment compare: stops after the sixth record has been printed.
			if (i++ > 4) break; 
		}
	} finally {
		// Release the file handle whether the loop finished, broke early or threw.
		is.close();
	}
}
 
Example #10
Source File: S3ReaderTest.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Fetches a compressed WARC file from S3 and prints, for the first six records, the
 * record header, its URL and up to the first 500 characters of the record content.
 *
 * @param args not used
 * @throws IOException if a record cannot be read
 * @throws S3ServiceException if the S3 service or object cannot be obtained
 */
public static void main(String[] args) throws IOException, S3ServiceException {
	// We're accessing a publicly available bucket so don't need to fill in our credentials
	S3Service s3s = new RestS3Service(null);

	// Let's grab a file out of the CommonCrawl S3 bucket
	String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);

	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
	try {
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println("Header: " + r.getHeader());
			System.out.println("URL: " + r.getHeader().getUrl());
			System.out.println();

			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as all the record's stated length
			byte[] rawData = new byte[r.available()];
			// A single read() may return fewer bytes than requested (common on network
			// streams), so keep reading until the buffer is full or the record ends.
			int filled = 0;
			while (filled < rawData.length) {
				int n = r.read(rawData, filled, rawData.length - filled);
				if (n < 0) {
					break;
				}
				filled += n;
			}
			// Note: potential optimization would be to have a large buffer only allocated once

			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			// Decode only the bytes actually read so a short record is not padded with NULs.
			String content = new String(rawData, 0, filled);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));

			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Post-increment compare: stops after the sixth record has been printed.
			if (i++ > 4) break; 
		}
	} finally {
		// NOTE(review): closing the reader is expected to release the underlying S3 data
		// stream as well — confirm against ArchiveReader.close().
		ar.close();
	}
}
 
Example #11
Source File: WARCFileRecordReader.java    From cc-warc-examples with MIT License 4 votes vote down vote up
/**
 * Returns the {@code ar} reader held by this record reader.
 *
 * @return the value of {@code ar}; this method performs no other work
 * @throws IOException declared by the signature; not thrown by this body
 * @throws InterruptedException declared by the signature; not thrown by this body
 */
@Override
public ArchiveReader getCurrentValue() throws IOException, InterruptedException {
	// We only ever have one value to give -- the output of the compressed file
	return ar;
}
 
Example #12
Source File: WARCFileInputFormat.java    From cc-warc-examples with MIT License 4 votes vote down vote up
/**
 * Creates a new {@code WARCFileRecordReader}.
 *
 * @param split not read by this method
 * @param context not read by this method
 * @return a freshly constructed {@code WARCFileRecordReader}
 * @throws IOException declared by the signature; not thrown by this body
 * @throws InterruptedException declared by the signature; not thrown by this body
 */
@Override
public RecordReader<Text, ArchiveReader> createRecordReader(InputSplit split, TaskAttemptContext context)
		throws IOException, InterruptedException {
	return new WARCFileRecordReader();
}
 
Example #13
Source File: ARCReaderFactory.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload that delegates to the three-argument
 * {@code getArchiveReader}, always passing {@code true} as the middle argument.
 *
 * @param f file to open
 * @param offset forwarded unchanged
 * @return whatever the three-argument overload returns
 * @throws IOException if the delegate throws
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
   throws IOException {
    // The middle argument of the three-argument overload is its skipSuffixTest flag.
    final boolean skipSuffixTest = true;
    return getArchiveReader(f, skipSuffixTest, offset);
}
 
Example #14
Source File: ARCReaderFactory.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Static entry point that forwards to the shared factory instance's
 * {@code getArchiveReader(String, InputStream, boolean)}.
 *
 * @param s forwarded unchanged as the first argument
 * @param is stream forwarded unchanged
 * @param atFirstRecord forwarded unchanged
 * @return the reader produced by the factory instance
 * @throws IOException if the factory instance throws
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
throws IOException {
    final ArchiveReader reader =
        ARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
    return reader;
}
 
Example #15
Source File: WARCReader.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Not implemented: always throws.
 *
 * @param f not read by this method
 * @return never returns normally
 * @throws NotImplementedException always, with the message {@code "TODO"}
 */
@Override
public ArchiveReader getDeleteFileOnCloseReader(final File f) {
    throw new NotImplementedException("TODO");
}
 
Example #16
Source File: WARCReaderFactory.java    From webarchive-commons with Apache License 2.0 4 votes vote down vote up
/**
 * Static entry point that forwards to the shared factory instance's
 * {@code getArchiveReader(String, InputStream, boolean)}.
 *
 * @param s forwarded unchanged as the first argument
 * @param is stream forwarded unchanged
 * @param atFirstRecord forwarded unchanged
 * @return the reader produced by the factory instance
 * @throws IOException if the factory instance throws
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
throws IOException {
    final ArchiveReader reader =
        WARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
    return reader;
}
 
Example #17
Source File: ArcHarvestFileDTO.java    From webcurator with Apache License 2.0 4 votes vote down vote up
/**
 * Opens the file named by {@code getName()} under {@code baseDir} and reports whether
 * the reader considers it compressed.
 *
 * @return the value of {@code reader.isCompressed()}
 * @throws IOException if the reader cannot be created or closed
 */
public boolean checkIsCompressed() throws IOException {
	ArchiveReader reader = ArchiveReaderFactory.get(new File(baseDir, this.getName()));
	try {
		return reader.isCompressed();
	} finally {
		// Close the reader even if the check throws, so the file handle is not leaked.
		reader.close();
	}
}