Java Code Examples for org.apache.nutch.crawl.CrawlDatum#STATUS_DB_FETCHED

The following examples show how to use org.apache.nutch.crawl.CrawlDatum#STATUS_DB_FETCHED. Each example is taken from an open-source project; the source file and originating project are noted above each listing.
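
STATUS_DB_FETCHED is the CrawlDb status recorded for a page that has been successfully fetched. As the examples below show, it is usually tested together with STATUS_DB_NOTMODIFIED, which marks a page that was re-fetched but found unchanged. A minimal sketch of that common check (the class and method names are illustrative):

import org.apache.nutch.crawl.CrawlDatum;

public class FetchedCheck {
    // true for CrawlDb entries whose content has been fetched:
    // either fetched outright, or re-fetched and found unmodified
    static boolean hasBeenFetched(CrawlDatum datum) {
        return datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
            || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED;
    }
}
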
Example 1
Source File: DeduplicationJob.java    From nutch-htmlunit with Apache License 2.0
@Override
public void map(Text key, CrawlDatum value,
        OutputCollector<BytesWritable, CrawlDatum> output,
        Reporter reporter) throws IOException {

    if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
            || value.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
        // || value.getStatus() == CrawlDatum.STATUS_DB_GONE
        byte[] signature = value.getSignature();
        // entries without a content signature cannot be compared, so skip them
        if (signature == null) return;
        BytesWritable sig = new BytesWritable(signature);
        // add the URL as temporary metadata
        value.getMetaData().put(urlKey, key);
        // reduce on the signature
        output.collect(sig, value);
    }
}
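
The map above stashes the URL in the datum's metadata and emits the content signature as the key, so the reduce side receives all datums sharing a signature and can mark all but one as duplicates. A simplified sketch of such a reducer, keeping only the highest-scoring entry (the real DeduplicationJob also breaks ties on fetch time and URL length; urlKey is the same metadata key used in the map):

public void reduce(BytesWritable signature, Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
        throws IOException {
    CrawlDatum best = null;
    while (values.hasNext()) {
        // copy the datum: Hadoop reuses the same Writable instance
        CrawlDatum datum = new CrawlDatum();
        datum.set(values.next());
        if (best == null) {
            best = datum;
            continue;
        }
        // keep the higher-scoring datum, mark the other as a duplicate
        CrawlDatum loser;
        if (datum.getScore() > best.getScore()) {
            loser = best;
            best = datum;
        } else {
            loser = datum;
        }
        loser.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
        // recover the URL the map stored as temporary metadata
        Text url = (Text) loser.getMetaData().remove(urlKey);
        output.collect(url, loser);
    }
}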
 
Example 2
Source File: DomainStatistics.java    From anthelion with Apache License 2.0 (identical code ships in nutch-htmlunit)
public void map(Text urlText, CrawlDatum datum, Context context) throws IOException, InterruptedException {

      if(datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
          || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {

        try {
          URL url = new URL(urlText.toString());
          String out = null;
          switch (mode) {
            case MODE_HOST:
              out = url.getHost();
              break;
            case MODE_DOMAIN:
              out = URLUtil.getDomainName(url);
              break;
            case MODE_SUFFIX:
              out = URLUtil.getDomainSuffix(url).getDomain();
              break;
            case MODE_TLD:
              out = URLUtil.getTopLevelDomainName(url);
              break;
          }
          if(out.trim().equals("")) {
            LOG.info("url : " + url);
            context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
          }

          context.write(new Text(out), new LongWritable(1));
        } catch (Exception ex) {
          // malformed URLs and failed domain lookups are skipped silently
        }

        context.getCounter(MyCounter.FETCHED).increment(1);
        context.write(FETCHED_TEXT, new LongWritable(1));
      }
      else {
        context.getCounter(MyCounter.NOT_FETCHED).increment(1);
        context.write(NOT_FETCHED_TEXT, new LongWritable(1));
      }
    }
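
Each matching URL above is emitted as a (key, 1) pair, so the job's reduce step only needs to sum the counts per host, domain, suffix, or TLD. A minimal sketch of such a summing reducer (the class name is illustrative; the actual reducer in DomainStatistics may differ):

public static class CountSumReducer
    extends Reducer<Text, LongWritable, Text, LongWritable> {

  @Override
  public void reduce(Text key, Iterable<LongWritable> values, Context context)
      throws IOException, InterruptedException {
    long total = 0;
    for (LongWritable count : values) {
      total += count.get();
    }
    // one output row per key, including the aggregate FETCHED and
    // NOT_FETCHED totals the map also emits
    context.write(key, new LongWritable(total));
  }
}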
 
Example 3
Source File: ArcSegmentCreator.java    From anthelion with Apache License 2.0 (identical code ships in nutch-htmlunit)
/**
 * <p>Runs the Map job to translate an arc record into output for Nutch 
 * segments.</p>
 * 
 * @param key The arc record header.
 * @param bytes The arc record raw content bytes.
 * @param output The output collector.
 * @param reporter The progress reporter.
 */
public void map(Text key, BytesWritable bytes,
  OutputCollector<Text, NutchWritable> output, Reporter reporter)
  throws IOException {

  String[] headers = key.toString().split("\\s+");
  String urlStr = headers[0];
  String version = headers[2];
  String contentType = headers[3];
  
  // ARC files start with a file-description record; we skip it because it
  // is not a content record
  if (urlStr.startsWith("filedesc://")) {
    LOG.info("Ignoring file header: " + urlStr);
    return;
  }
  LOG.info("Processing: " + urlStr);

  // get the raw bytes from the arc file and create a new CrawlDatum
  Text url = new Text();
  CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
    1.0f);
  String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

  // normalize and filter the urls
  try {
    urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
    urlStr = urlFilters.filter(urlStr); // filter the url
  }
  catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    urlStr = null;
  }

  // if still a good url then process
  if (urlStr != null) {

    url.set(urlStr);
    try {

      // set the protocol status and the crawl status to success, then create
      // the content from the normalized url and the raw bytes from the arc
      // file. TODO: this currently doesn't handle the text of error pages
      // (e.g. 404s); we assume we won't get those.
      ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
      Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType,
        new Metadata(), getConf());
      
      // set the url version into the metadata
      content.getMetadata().set(URL_VERSION, version);
      ParseStatus pstatus = output(output, segmentName, url, datum, content, status,
        CrawlDatum.STATUS_FETCH_SUCCESS);
        CrawlDatum.STATUS_FETCH_SUCCESS);
      reporter.progress();
    }
    catch (Throwable t) { // unexpected exception
      logError(url, t);
      output(output, segmentName, url, datum, null, null,
        CrawlDatum.STATUS_FETCH_RETRY);
    }
  }
}
 
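
For reference, the three-argument CrawlDatum constructor used in this example sets the status, the re-fetch interval in seconds, and the score in a single call. A minimal standalone sketch (the 30-day interval and the score of 1.0f are illustrative values):

import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumDemo {
  public static void main(String[] args) {
    // mark a page as already fetched, to be re-fetched in 30 days
    int fetchIntervalSecs = 30 * 24 * 60 * 60;
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED,
        fetchIntervalSecs, 1.0f);
    datum.setFetchTime(System.currentTimeMillis()); // milliseconds since epoch
    System.out.println(datum); // prints a human-readable summary of the datum
  }
}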