org.archive.io.warc.WARCReaderFactory Java Examples

The following examples show how to use org.archive.io.warc.WARCReaderFactory. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: S3ReaderTest.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Fetches one WARC file from the public CommonCrawl S3 bucket and prints the header,
 * the URL and up to the first 500 characters of the first six records.
 *
 * @param args unused
 * @throws IOException if reading the archive stream fails
 * @throws S3ServiceException if the S3 object cannot be retrieved
 */
public static void main(String[] args) throws IOException, S3ServiceException {
	// We're accessing a publicly available bucket so don't need to fill in our credentials
	S3Service s3s = new RestS3Service(null);
	
	// Let's grab a file out of the CommonCrawl S3 bucket
	String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
	
	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
	try {
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for (ArchiveRecord r : ar) {
			// The header contains information such as the type of record, size, creation time, and URL
			System.out.println("Header: " + r.getHeader());
			System.out.println("URL: " + r.getHeader().getUrl());
			System.out.println();
			
			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array sized to what the record reports as available
			// Note: potential optimization would be to have a large buffer only allocated once
			byte[] rawData = new byte[r.available()];
			// A single read() may deliver fewer bytes than requested, so keep reading
			// until the buffer is full or the record is exhausted
			int filled = 0;
			while (filled < rawData.length) {
				int n = r.read(rawData, filled, rawData.length - filled);
				if (n <= 0) {
					// End of record (or no progress): stop rather than spin
					break;
				}
				filled += n;
			}
			
			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			// Only the bytes actually read are decoded, never the unfilled tail of the buffer
			String content = new String(rawData, 0, filled);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));
			
			// Pretty printing to make the output more readable
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Post-increment compare: the loop stops after the sixth record has been printed
			if (i++ > 4) break;
		}
	} finally {
		// Release the reader even when a record fails to read
		ar.close();
	}
}
 
Example #2
Source File: WARCReaderTest.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Opens a local compressed WARC file and prints the header, the URL and up to the
 * first 500 characters of the first six records.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened or a record cannot be read
 */
public static void main(String[] args) throws IOException {
	// Set up a local compressed WARC file for reading 
	String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	FileInputStream is = new FileInputStream(fn);
	try {
		// The file name identifies the ArchiveReader and indicates if it should be decompressed
		ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
		try {
			// Once we have an ArchiveReader, we can work through each of the records it contains
			int i = 0;
			for (ArchiveRecord r : ar) {
				// The header contains information such as the type of record, size, creation time, and URL
				System.out.println(r.getHeader());
				System.out.println(r.getHeader().getUrl());
				System.out.println();
				
				// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
				// Create a byte array sized to what the record reports as available
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				
				// Why don't we convert it to a string and print the start of it? Let's hope it's text!
				String content = new String(rawData);
				System.out.println(content.substring(0, Math.min(500, content.length())));
				System.out.println((content.length() > 500 ? "..." : ""));
				
				// Pretty printing to make the output more readable 
				System.out.println("=-=-=-=-=-=-=-=-=");
				// Post-increment compare: the loop stops after the sixth record has been printed
				if (i++ > 4) break;
			}
		} finally {
			// Release the reader even when a record fails to read
			ar.close();
		}
	} finally {
		// Also covers the case where the reader could not be created at all;
		// closing an already-closed FileInputStream has no effect
		is.close();
	}
}
 
Example #3
Source File: WARCFileRecordReader.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Opens the file referenced by the given split and creates an archive reader over it.
 * Stores the opened stream in {@code fsin}, the file's name in {@code arPath} and the
 * reader in {@code ar}.
 *
 * @param inputSplit the split to read; cast to {@code FileSplit}
 * @param context supplies the configuration used to resolve the file system
 * @throws IOException if the file cannot be opened or the reader cannot be created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
		throws IOException, InterruptedException {
	final Path warcPath = ((FileSplit) inputSplit).getPath();
	final String warcName = warcPath.getName();

	// Resolve the file system from the task configuration and open the raw stream
	fsin = warcPath.getFileSystem(context.getConfiguration()).open(warcPath);
	arPath = warcName;
	ar = WARCReaderFactory.get(warcName, fsin, true);
}
 
Example #4
Source File: ArchiveReaderFactory.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the reader factory that matches the file's name suffix and opens the file
 * at the given offset.
 *
 * @param f the archive file; its name decides between ARC and WARC
 * @param offset passed through to the selected factory
 * @return a reader from {@code ARCReaderFactory} or {@code WARCReaderFactory}
 * @throws IOException if the name has neither an ARC nor a WARC suffix, or if the
 *         selected factory fails
 */
protected ArchiveReader getArchiveReader(final File f,
	final long offset)
throws IOException {
	final String fileName = f.getName();

	// ARC is checked first, then WARC
	if (ARCReaderFactory.isARCSuffix(fileName)) {
		return ARCReaderFactory.get(f, true, offset);
	}
	if (WARCReaderFactory.isWARCSuffix(fileName)) {
		return WARCReaderFactory.get(f, offset);
	}

	throw new IOException("Unknown file extension (Not ARC nor WARC): "
		+ fileName);
}
 
Example #5
Source File: ArchiveReaderFactory.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the reader factory that matches the identifier's suffix and wraps the
 * given stream with it.
 *
 * @param id identifier whose suffix decides between ARC and WARC
 * @param is the stream handed to the selected factory
 * @param atFirstRecord passed through to the selected factory
 * @return a reader from {@code ARCReaderFactory} or {@code WARCReaderFactory}
 * @throws IOException if the identifier has neither an ARC nor a WARC suffix, or if
 *         the selected factory fails
 */
protected ArchiveReader getArchiveReader(final String id, 
		final InputStream is, final boolean atFirstRecord)
throws IOException {
	// ARC is checked first, then WARC
	if (ARCReaderFactory.isARCSuffix(id)) {
		return ARCReaderFactory.get(id, is, atFirstRecord);
	}
	if (WARCReaderFactory.isWARCSuffix(id)) {
		return WARCReaderFactory.get(id, is, atFirstRecord);
	}

	throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
}
 
Example #6
Source File: WarcTargetRepository.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a WARC reader for the file at the given path.
 *
 * @param filePath location of the WARC file
 * @return the reader produced by {@code WARCReaderFactory}
 * @throws IOException if the factory cannot open the file
 */
private WARCReader openFile(Path filePath) throws IOException {
    final java.io.File warcFile = filePath.toFile();
    return WARCReaderFactory.get(warcFile);
}
 
Example #7
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Inserts two pages into a {@code WarcTargetRepository}, closes it, then reads the
 * written file back and checks that the second and third records carry the two
 * inserted URLs in insertion order.
 */
@Test
public void testReadingMultipleWarcRecords() throws Exception {
    // given
    String folder = tempFolder.newFolder().toString();

    String url1 = "http://a.com";
    String url2 = "http://b.com";

    Page target1 = new Page(new URL(url1), html, responseHeaders);
    target1.setFetchTime(System.currentTimeMillis());

    Page target2 = new Page(new URL(url2), html, responseHeaders);
    target2.setFetchTime(System.currentTimeMillis());

    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(target1);
    repository.insert(target2);
    repository.close();

    WARCWriter writer = repository.getWriter();
    WARCReader reader = WARCReaderFactory.get(writer.getFile());

    // then
    boolean readWarcInfoRecord = false;
    boolean readFirst = false;
    boolean readSecond = false;

    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
            WARCRecord ar = (WARCRecord) i.next();
            if (!readWarcInfoRecord) {
                // The leading record is skipped without inspection
                // (presumably the warcinfo record, going by the flag name)
                readWarcInfoRecord = true;
            } else if (!readFirst) {
                assertThat(ar.getHeader().getUrl(), is(url1));
                readFirst = true;
            } else if (!readSecond) {
                assertThat(ar.getHeader().getUrl(), is(url2));
                readSecond = true;
            }
        }
    } finally {
        // Close the reader even when an assertion above fails
        reader.close();
    }

    // Without these checks the test would pass on a file with fewer than three records
    assertThat(readWarcInfoRecord, is(true));
    assertThat(readFirst, is(true));
    assertThat(readSecond, is(true));
}