org.archive.io.ArchiveRecord Java Examples

The following examples show how to use org.archive.io.ArchiveRecord. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: WARCReaderFactory.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Returns an iterator over the records of this archive in which each
 * record is built from the next member handed out by the underlying
 * {@code GZIPMembersInputStream}.
 *
 * @return an iterator that creates one record per member stream
 */
public Iterator<ArchiveRecord> iterator() {
    // Specialise ArchiveRecordIterator so that iteration is driven by the
    // member iterator of the stream returned by getIn().
    return new ArchiveRecordIterator() {
        private GZIPMembersInputStream memberStream =
            (GZIPMembersInputStream) getIn();

        private Iterator<GZIPMembersInputStream> members =
            this.memberStream.memberIterator();

        protected boolean innerHasNext() {
            return this.members.hasNext();
        }

        protected ArchiveRecord innerNext() throws IOException {
            // Advance to the next member first; the member offsets are
            // queried only afterwards, and the larger of the two is used
            // as the record offset.
            InputStream member = this.members.next();
            long memberStart = memberStream.getCurrentMemberStart();
            long memberEnd = memberStream.getCurrentMemberEnd();
            return createArchiveRecord(member, Math.max(memberStart, memberEnd));
        }
    };
}
 
Example #2
Source File: ArchiveReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Opens a freshly written ARC through its absolute path given as a String
 * and checks that every record reports a non-blank mimetype.
 *
 * @throws IOException if the ARC cannot be created or read
 */
public void testGetPath() throws IOException {
    File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
    String arcPath = arc.getAbsoluteFile().getAbsolutePath();
    ArchiveReader reader = null;
    try {
        reader = ArchiveReaderFactory.get(arcPath);
        Iterator<ArchiveRecord> records = reader.iterator();
        while (records.hasNext()) {
            ArchiveRecord record = records.next();
            String mimetype = record.getHeader().getMimetype();
            assertTrue("mime unread", StringUtils.isNotBlank(mimetype));
        }
    } finally {
        // The reader stays null when the factory call itself failed.
        if (reader != null) {
            reader.close();
        }
    }
}
 
Example #3
Source File: ArchiveReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Opens a freshly written ARC through a {@link File} handle and checks
 * that every record reports a non-blank mimetype.
 *
 * @throws IOException if the ARC cannot be created or read
 */
public void testGetFile() throws IOException {
    File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
    File absoluteArc = arc.getAbsoluteFile();
    ArchiveReader reader = null;
    try {
        reader = ArchiveReaderFactory.get(absoluteArc);
        Iterator<ArchiveRecord> records = reader.iterator();
        while (records.hasNext()) {
            ArchiveRecord record = records.next();
            String mimetype = record.getHeader().getMimetype();
            assertTrue("mime unread", StringUtils.isNotBlank(mimetype));
        }
    } finally {
        // The reader stays null when the factory call itself failed.
        if (reader != null) {
            reader.close();
        }
    }
}
 
Example #4
Source File: ArchiveReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Opens a freshly written ARC through a {@code file:} URL and checks that
 * every record reports a non-blank mimetype.
 *
 * @throws IOException if the ARC cannot be created or read, or the URL
 *         is malformed
 */
public void testGetFileURL() throws IOException {
    File arc = ARCWriterTest.createARCFile(getTmpDir(), true);
    ArchiveReader reader = null;
    try {
        URL arcUrl = new URL("file:////" + arc.getAbsolutePath());
        reader = ArchiveReaderFactory.get(arcUrl);
        Iterator<ArchiveRecord> records = reader.iterator();
        while (records.hasNext()) {
            ArchiveRecord record = records.next();
            String mimetype = record.getHeader().getMimetype();
            assertTrue("mime unread", StringUtils.isNotBlank(mimetype));
        }
    } finally {
        // The reader stays null when the URL or factory call failed.
        if (reader != null) {
            reader.close();
        }
    }
}
 
Example #5
Source File: WARCWriterTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Reads a few bytes of the second record of a freshly written one-record
 * archive into a buffer at a non-zero offset, asserting that every read
 * makes progress.
 *
 * @throws Exception if the archive cannot be written or read
 */
public void testArcRecordOffsetReads() throws Exception {
	// Get an archive with one record.
	WriterPoolMember w =
		createWithOneRecord("testArcRecordInBufferStream", true);
	w.close();
	// Get reader on said archive.
	WARCReader r = WARCReaderFactory.get(w.getFile());
	try {
		final Iterator<ArchiveRecord> i = r.iterator();
		// Skip the first (meta) record; its value is not needed.
		i.next();
		// NOTE(review): hasNext() is called only for its effect on the
		// iterator, as in the original test — confirm it is required.
		i.hasNext();
		// Now we're at the first and only content record.
		ArchiveRecord ar = (WARCRecord) i.next();
		// Now try getting some set of bytes out of it at an odd offset
		// (used to fail because we were doing bad math to find where in
		// the buffer to read).
		final byte[] buffer = new byte[17];
		final int maxRead = 4;
		int totalRead = 0;
		while (totalRead < maxRead) {
			totalRead = totalRead
			    + ar.read(buffer, 13 + totalRead, maxRead - totalRead);
			assertTrue(totalRead > 0);
		}
	} finally {
		// Release the reader even when an assertion fails.
		r.close();
	}
}
 
Example #6
Source File: ARCReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Opens {@code testfile} at {@code offset} through a stream built on a
 * {@link RandomAccessFile} descriptor, reads one record and checks both
 * its URL and that the record position does not exceed its length.
 *
 * @param testfile archive file to read from
 * @param offset byte offset to seek to before reading
 * @param uri URL the record at that offset is expected to carry
 * @throws Exception if the file cannot be read or an assertion fails
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    RandomAccessFile raf = new RandomAccessFile(testfile, "r");
    try {
        raf.seek(offset);
        // The stream shares the descriptor of raf, so it starts at offset.
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        // Per the original note, ARCReaderFactory.get(testfile, offset)
        // is the variant known to work.
        ArchiveRecord record = reader.get();

        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);

        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    } finally {
        // Clean up even when an assertion above fails.
        raf.close();
    }
}
 
Example #7
Source File: ARCReaderFactory.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Drains the stream returned by {@code getIn()} after a record has been
 * read. Leading line separators are skipped silently; if anything else
 * follows, the remaining bytes are counted and reported, by exception in
 * strict mode and as a warning otherwise.
 *
 * @param rec record whose header is included in the report
 * @throws IOException on read failure, or in strict mode when trailing
 *         bytes are found
 */
protected void gotoEOR(ArchiveRecord rec) throws IOException {
    // Swallow the run of line separators that follows the record.
    int c = getIn().read();
    while (c == LINE_SEPARATOR) {
        c = getIn().read();
    }
    if (c == -1) {
        // Only separators were left before the end of the stream.
        return;
    }
    // The byte just read is the first unexpected one; count the rest.
    long skipped = 1;
    while (getIn().read() > -1) {
        skipped++;
    }
    // The header is looked up on rec only while a current record is set.
    ArchiveRecordHeader meta = null;
    if (getCurrentRecord() != null) {
        meta = rec.getHeader();
    }
    long memberStart =
        ((GZIPMembersInputStream) getIn()).getCurrentMemberStart();
    String metaText = (meta != null) ? meta.toString() : "";
    String message = "Record STARTING at " + memberStart
        + " has " + skipped + " trailing byte(s): " + metaText;
    if (isStrict()) {
        throw new IOException(message);
    }
    logStdErr(Level.WARNING, message);
}
 
Example #8
Source File: ARCReaderFactory.java    From webarchive-commons with Apache License 2.0 6 votes vote down vote up
/**
 * Returns an iterator over the records of this archive in which each
 * record is built from the next member handed out by the underlying
 * {@code GZIPMembersInputStream}.
 *
 * @return an iterator that creates one record per member stream
 */
public Iterator<ArchiveRecord> iterator() {
    // Specialise ArchiveRecordIterator so that iteration is driven by the
    // member iterator of the stream returned by getIn().
    return new ArchiveRecordIterator() {
        private GZIPMembersInputStream memberStream =
            (GZIPMembersInputStream) getIn();

        private Iterator<GZIPMembersInputStream> members =
            this.memberStream.memberIterator();

        protected boolean innerHasNext() {
            return this.members.hasNext();
        }

        protected ArchiveRecord innerNext() throws IOException {
            // Move to the next member, then use the larger of the two
            // member offsets as the record offset.
            InputStream member = this.members.next();
            long memberStart = memberStream.getCurrentMemberStart();
            long memberEnd = memberStream.getCurrentMemberEnd();
            return createArchiveRecord(member, Math.max(memberStart, memberEnd));
        }
    };
}
 
Example #9
Source File: ARCReader.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Copies every record of this ARC to standard output through an
 * {@code ARCWriter}. The body of the first record is read fully and
 * handed to the writer settings as metadata; every later record is
 * written with its URL, mimetype, IP, date and length.
 *
 * @param compress passed to the writer settings
 * @throws IOException on read or write failure
 * @throws java.text.ParseException if a record date cannot be parsed
 */
public void dump(final boolean compress)
throws IOException, java.text.ParseException {
    // Digesting is pointless when all we do is copy records out.
    setDigest(false);
    ARCWriter writer = null;
    boolean headerPending = true;
    Iterator<ArchiveRecord> records = iterator();
    while (records.hasNext()) {
        ARCRecord record = (ARCRecord) records.next();
        ARCRecordMetaData meta = record.getMetaData();
        if (headerPending) {
            headerPending = false;
            // Byte-at-a-time copy is slow, but it happens only once, for
            // the record at the top of the ARC.
            ByteArrayOutputStream body =
                new ByteArrayOutputStream(record.available());
            while (record.available() > 0) {
                body.write(record.read());
            }
            List<String> metadata = new ArrayList<String>();
            metadata.add(body.toString(WriterPoolMember.UTF8));
            // getArc() is assumed to return a full path to the file;
            // ARCWriter or new File will complain otherwise.
            List<File> outDirs = new ArrayList<File>();
            WriterPoolSettingsData settings = new WriterPoolSettingsData(
                "", "", -1L, compress, outDirs, metadata);
            writer = new ARCWriter(new AtomicInteger(), System.out,
                new File(meta.getArc()), settings);
        } else {
            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int) meta.getLength(), record);
        }
    }
}
 
Example #10
Source File: ARCReader.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Skip over any trailing new lines at end of the record so we're lined up
 * ready to read the next.
 *
 * <p>Bytes are consumed from {@code getIn()} for as long as it reports
 * available data. A byte other than {@code LINE_SEPARATOR} ends the scan:
 * if the stream supports mark/reset the byte is pushed back by resetting
 * to the mark, otherwise an {@link IOException} is thrown.
 *
 * @param record record whose header fields are appended to the error
 *        message (only consulted when a current record is set)
 * @throws IOException on read failure, or when an unexpected byte is read
 *         from a stream that does not support mark/reset
 */
protected void gotoEOR(ArchiveRecord record) throws IOException {
    // Nothing to skip when the stream reports no available bytes.
    if (getIn().available() <= 0) {
        return;
    }
    
    // Remove any trailing LINE_SEPARATOR
    int c = -1;
    while (getIn().available() > 0) {
        // Remember the position before each single-byte read so that one
        // overread byte can be undone below.
        if (getIn().markSupported()) {
            getIn().mark(1);
        }
        c = getIn().read();
        // A -1 read falls through and the loop re-checks available().
        if (c != -1) {
            if (c == LINE_SEPARATOR) {
                continue;
            }
            if (getIn().markSupported()) {
                // We've overread.  We're probably in next record.  There is
                // no way of telling for sure. It may be dross at end of
                // current record. Backup.
                    getIn().reset();
                break;
            }
            // No way to push the byte back: report it together with the
            // reader identifier and, when available, the header fields.
            ArchiveRecordHeader h = (getCurrentRecord() != null)?
                record.getHeader(): null;
            throw new IOException("Read " + (char)c +
                " when only " + LINE_SEPARATOR + " expected. " + 
                getReaderIdentifier() + ((h != null)?
                    h.getHeaderFields().toString(): ""));
        }
    }
}
 
Example #11
Source File: WARCReader.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Consumes the terminator that follows a fully read record so the stream
 * is positioned at the start of the next one.
 *
 * @param record record that must already have been read to its end
 * @throws IOException if the record still has bytes available, or if
 *         reading an expected terminator character fails
 */
protected void gotoEOR(ArchiveRecord record) throws IOException {
    final boolean exhausted = record.available() == 0;
    if (!exhausted) {
        throw new IOException("Record should be exhausted before coming " +
            "in here");
    }

    // A record is terminated by CRLF twice over; consume both pairs.
    for (int pair = 0; pair < 2; pair++) {
        readExpectedChar(getIn(), CRLF.charAt(0));
        readExpectedChar(getIn(), CRLF.charAt(1));
    }
}
 
Example #12
Source File: WARCReader.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Prints every record to standard output: the header's string form, the
 * record's own dump, then an empty line.
 *
 * @param compress not used by this implementation
 * @throws IOException on read failure
 * @throws java.text.ParseException declared by the overridden method
 */
@Override
public void dump(boolean compress)
throws IOException, java.text.ParseException {
    final Iterator<ArchiveRecord> records = iterator();
    while (records.hasNext()) {
        ArchiveRecord record = records.next();
        String headerText = record.getHeader().toString();
        System.out.println(headerText);
        record.dump();
        System.out.println();
    }
}
 
Example #13
Source File: WARCReaderFactory.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the stream returned by {@code getIn()} to its end, counting the
 * bytes consumed, and prints a note on standard error when more than four
 * bytes were left over.
 *
 * @param rec record named in the diagnostic message
 * @throws IOException on read failure
 */
protected void gotoEOR(ArchiveRecord rec) throws IOException {
    long skipped = 0;
    for (int b = getIn().read(); b > -1; b = getIn().read()) {
        skipped++;
    }
    // Up to four leftover bytes are tolerated without comment.
    if (skipped > 4) {
        System.err.println("unexpected extra data after record "+rec);
    }
}
 
Example #14
Source File: ARCWriterTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Walks all records of the reader, closing each one, and checks that the
 * URL of every record after the first starts with {@code SOME_URL}.
 *
 * @param r reader to iterate
 * @return number of records seen
 * @throws IOException on read failure
 */
protected int iterateRecords(ARCReader r)
throws IOException {
    int seen = 0;
    Iterator<ArchiveRecord> records = r.iterator();
    while (records.hasNext()) {
        ARCRecord record = (ARCRecord) records.next();
        record.close();
        // The first record is exempt from the URL check.
        if (seen != 0) {
            assertTrue("Unexpected URL " + record.getMetaData().getUrl(),
                record.getMetaData().getUrl().startsWith(SOME_URL));
        }
        seen++;
    }
    return seen;
}
 
Example #15
Source File: ARCWriterTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Test a particular style of using the reader iterator: a plain
 * hasNext()/next() loop that closes each record as it goes. (Should
 * possibly be on a reader-centric test class, but the best setup
 * functionality is here.)
 *
 * @throws IOException if the ARC cannot be written or read
 */
public void testReadIterator() throws IOException {
    final int recordCount = 3;
    File arcFile = writeRecords("writeRecord", true,
        DEFAULT_MAX_ARC_FILE_SIZE, recordCount);
    ARCReader reader = ARCReaderFactory.get(arcFile);
    try {
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord next = it.next();
            next.close();
        }
    } finally {
        // Close the reader even when iteration throws.
        reader.close();
    }
}
 
Example #16
Source File: ARCWriterTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the second record of the reader, discarding the first.
 *
 * @param r reader positioned at the start of an ARC
 * @return the record that follows the leading meta record
 */
protected ARCRecord getSingleRecord(ARCReader r) {
    final Iterator<ArchiveRecord> records = r.iterator();
    // Discard the leading ARC meta record.
    records.next();
    records.hasNext();
    // What follows is the first and only content record in the ARC.
    ArchiveRecord content = records.next();
    return (ARCRecord) content;
}
 
Example #17
Source File: WordCounterMap.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Emits one (word, outVal) pair per whitespace-separated token of every
 * {@code text/plain} record in the archive, and maintains counters for
 * processed, empty, non-plain-text and failed records.
 *
 * @param key input key (not used)
 * @param value archive whose records are scanned
 * @param context Hadoop context receiving output pairs and counters
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	for (ArchiveRecord r : value) {
		try {
			if (r.getHeader().getMimetype().equals("text/plain")) {
				context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
				LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
				// Convenience function that reads the full message into a raw byte array
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				// Decode with an explicit charset so results do not depend on
				// the platform default.
				// NOTE(review): assumes the plain-text records are UTF-8 — confirm.
				String content = new String(rawData, "UTF-8");
				// Grab each word from the document
				tokenizer = new StringTokenizer(content);
				if (!tokenizer.hasMoreTokens()) {
					context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
				} else {
					while (tokenizer.hasMoreTokens()) {
						outKey.set(tokenizer.nextToken());
						context.write(outKey, outVal);
					}
				}
			} else {
				context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
			}
		}
		catch (Exception ex) {
			// Best effort: a failing record is logged and counted, and the
			// remaining records are still processed.
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}
 
Example #18
Source File: WARCReaderFactoryTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Opens each test file through {@code WARCReaderFactory.get(String,
 * InputStream, boolean)} and checks that the first record is a
 * {@code warcinfo} record.
 *
 * @throws IOException if a test file cannot be opened or read
 */
public void testGetStringInputstreamBoolean() throws IOException {
	// Check the test files can be opened:
	for( String file : files ) {
		FileInputStream is = new FileInputStream(file);
		try {
			ArchiveReader ar = WARCReaderFactory.get(file, is, true);
			try {
				ArchiveRecord r = ar.get();
				String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
				// Check the first record comes out as a 'warcinfo' record.
				assertEquals(WARCRecordType.warcinfo.name(), type);
			} finally {
				ar.close();
			}
		} finally {
			// Also covers the case where the factory call itself throws.
			is.close();
		}
	}
}
 
Example #19
Source File: WARCWriterTest.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Walks all records of the reader, closing each one, and checks that the
 * URL of every record after the first equals {@code SOME_URL}.
 *
 * @param r reader to iterate
 * @return number of records seen
 * @throws IOException on read failure
 */
protected int iterateRecords(WARCReader r)
throws IOException {
    int seen = 0;
    Iterator<ArchiveRecord> records = r.iterator();
    while (records.hasNext()) {
        ArchiveRecord record = records.next();
        record.close();
        // The first record is exempt from the URL check.
        if (seen != 0) {
            assertTrue("Unexpected URL " + record.getHeader().getUrl(),
                record.getHeader().getUrl().equals(SOME_URL));
        }
        seen++;
    }
    return seen;
}
 
Example #20
Source File: TagCounterMap.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Emits one (lower-cased tag name, outVal) pair for every HTML tag matched
 * in the body of each HTTP response record whose header text mentions
 * {@code Content-Type: text/html}.
 *
 * @param key input key (not used)
 * @param value archive whose records are scanned
 * @param context Hadoop context receiving output pairs and counters
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	// Compile the regular expression only on first use; later calls reuse it
	if (patternTag == null) {
		patternTag = Pattern.compile(HTML_TAG_PATTERN);
	}
	
	for (ArchiveRecord r : value) {
		try {
			LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
			// We're only interested in processing the responses, not requests or metadata
			if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) {
				// Convenience function that reads the full message into a raw byte array
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				String content = new String(rawData);
				// The blank line separates the HTTP header from the body. When it is
				// missing the index is -1 and substring throws, which the catch
				// below logs and counts.
				int headerEnd = content.indexOf("\r\n\r\n");
				// The HTTP header gives us valuable information about what was received during the request
				String headerText = content.substring(0, headerEnd);
				
				// In our task, we're only interested in text/html, so we can be a little lax
				// TODO: Proper HTTP header parsing + don't trust headers
				if (headerText.contains("Content-Type: text/html")) {
					context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
					// Only extract the body of the HTTP response when necessary
					String body = content.substring(headerEnd + 4);
					// Process all the matched HTML tags found in the body of the document
					matcherTag = patternTag.matcher(body);
					while (matcherTag.find()) {
						String tagName = matcherTag.group(1);
						outKey.set(tagName.toLowerCase());
						context.write(outKey, outVal);
					}
				}
			}
		}
		catch (Exception ex) {
			// Best effort: a failing record is logged and counted, and the
			// remaining records are still processed.
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}
 
Example #21
Source File: WARCReaderTest.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Reads a local compressed WARC file and prints the header, URL and the
 * first 500 characters of the first six records.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
	// Set up a local compressed WARC file for reading 
	String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	FileInputStream is = new FileInputStream(fn);
	ArchiveReader ar = null;
	try {
		// The file name identifies the ArchiveReader and indicates if it should be decompressed
		ar = WARCReaderFactory.get(fn, is, true);
		
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println(r.getHeader());
			System.out.println(r.getHeader().getUrl());
			System.out.println();
			
			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as the record's stated length
			byte[] rawData = IOUtils.toByteArray(r, r.available());
			
			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			String content = new String(rawData);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));
			
			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Stops once the sixth record has been printed.
			if (i++ > 4) break; 
		}
	} finally {
		// Release the reader and the file handle on every exit path.
		try {
			if (ar != null) {
				ar.close();
			}
		} finally {
			is.close();
		}
	}
}
 
Example #22
Source File: S3ReaderTest.java    From cc-warc-examples with MIT License 5 votes vote down vote up
/**
 * Fetches a WARC file from the public Common Crawl S3 bucket and prints
 * the header, URL and the first 500 characters of the first six records.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if the object stream cannot be read
 * @throws S3ServiceException if the S3 request fails
 */
public static void main(String[] args) throws IOException, S3ServiceException {
	// We're accessing a publicly available bucket so don't need to fill in our credentials
	S3Service s3s = new RestS3Service(null);
	
	// Let's grab a file out of the CommonCrawl S3 bucket
	String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
	
	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
	try {
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println("Header: " + r.getHeader());
			System.out.println("URL: " + r.getHeader().getUrl());
			System.out.println();
			
			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as all the record's stated length
			byte[] rawData = new byte[r.available()];
			// A single read() may return fewer bytes than requested, so keep
			// reading until the buffer is full or the record ends.
			int filled = 0;
			while (filled < rawData.length) {
				int n = r.read(rawData, filled, rawData.length - filled);
				if (n < 0) {
					break;
				}
				filled += n;
			}
			// Note: potential optimization would be to have a large buffer only allocated once
			
			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			// Only the bytes actually read are decoded.
			String content = new String(rawData, 0, filled);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));
			
			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Stops once the sixth record has been printed.
			if (i++ > 4) break; 
		}
	} finally {
		// Release the reader on every exit path.
		ar.close();
	}
}
 
Example #23
Source File: ArcHarvestFileDTO.java    From webcurator with Apache License 2.0 5 votes vote down vote up
/**
 * Create and return the index of the ArcHarvestFile. WARC records are
 * indexed only when they are responses whose mimetype is not
 * {@code text/dns}; every non-WARC record is indexed as an ARC record.
 * Also records on this object whether the archive is compressed.
 *
 * @param baseDir the base directory of the arcs
 * @return the indexed resources, keyed by String
 * @throws IOException thrown if there is an error
 * @throws ParseException presumably raised by the indexing helpers — confirm
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
	Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
	
	File theArchiveFile = new File(baseDir, this.getName());
	ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
	try {
		this.compressed = reader.isCompressed();
		
		Iterator<ArchiveRecord> it = reader.iterator();
		while(it.hasNext()) {
			ArchiveRecord rec = it.next();
			
			if(rec instanceof WARCRecord) {
				String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
				if(type.equals(WARCConstants.RESPONSE)) {
					String mime = rec.getHeader().getMimetype();
					if(!mime.equals("text/dns")) {
						indexWARCResponse(rec, results);
					}
				}
			}
			else {
				indexARCRecord(rec, results);
			}
		}
	} finally {
		// Close the reader even when indexing a record fails.
		reader.close();
	}
	
	return results;
}
 
Example #24
Source File: WarcTargetRepositoryTest.java    From ache with Apache License 2.0 4 votes vote down vote up
/**
 * Inserts two pages into a WARC repository, re-reads the written file and
 * checks that the two records following the first one carry the two page
 * URLs in insertion order.
 *
 * @throws Exception if the repository cannot be written or read
 */
@Test
public void testReadingMultipleWarcRecords() throws Exception {
    String folder = tempFolder.newFolder().toString();

    String url1 = "http://a.com";
    String url2 = "http://b.com";

    Page target1 = new Page(new URL(url1), html, responseHeaders);
    target1.setFetchTime(System.currentTimeMillis());

    Page target2 = new Page(new URL(url2), html, responseHeaders);
    target2.setFetchTime(System.currentTimeMillis());

    WarcTargetRepository repository = new WarcTargetRepository(folder);

    // when
    repository.insert(target1);
    repository.insert(target2);
    repository.close();

    WARCWriter writer = repository.getWriter();
    WARCReader reader = WARCReaderFactory.get(writer.getFile());

    // then
    // The first record is skipped as the warcinfo record; the next two
    // are expected to be the inserted pages.
    boolean readWarcInfoRecord = false;
    boolean readFirst = false;
    boolean readSecond = false;

    try {
        for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
            WARCRecord ar = (WARCRecord) i.next();
            if (!readWarcInfoRecord) {
                readWarcInfoRecord = true;
            } else if (!readFirst) {
                readFirst = true;
                assertThat(ar.getHeader().getUrl(), is(url1));
            } else if (!readSecond) {
                // NOTE(review): `url` is not declared in this method;
                // presumably a field of the test class — confirm it is needed.
                url = ar.getHeader().getUrl();
                assertThat(ar.getHeader().getUrl(), is(url2));
                readSecond = true;
            }
        }
    } finally {
        // Close the reader even when an assertion fails.
        reader.close();
    }

    // Without these checks the test would pass even if the file held
    // fewer records than were inserted.
    assertThat(readFirst, is(true));
    assertThat(readSecond, is(true));
}