Java Code Examples for org.apache.nutch.crawl.CrawlDatum#getStatus()

The following examples show how to use org.apache.nutch.crawl.CrawlDatum#getStatus(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.

Example 1

Source File: DeduplicationJob.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Emits the datum keyed by its signature so that entries sharing a
 * signature end up in the same reduce call.
 *
 * Only entries whose status is STATUS_DB_FETCHED or STATUS_DB_NOTMODIFIED
 * are emitted, and only when they carry a non-null signature. The URL
 * (the incoming key) is stored in the datum's metadata under
 * {@code urlKey} before the datum is collected.
 */
@Override
public void map(Text key, CrawlDatum value,
        OutputCollector<BytesWritable, CrawlDatum> output,
        Reporter reporter) throws IOException {

    // Skip everything that is neither fetched nor not-modified.
    // (STATUS_DB_GONE entries are not considered either.)
    if (value.getStatus() != CrawlDatum.STATUS_DB_FETCHED
            && value.getStatus() != CrawlDatum.STATUS_DB_NOTMODIFIED) {
        return;
    }

    final byte[] rawSignature = value.getSignature();
    if (rawSignature == null) {
        // nothing to group on
        return;
    }

    // remember the URL in the metadata; the output key is the signature
    value.getMetaData().put(urlKey, key);

    final BytesWritable signatureKey = new BytesWritable(rawSignature);
    output.collect(signatureKey, value);
}

Example 2

Source File: DeduplicationJob.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Collapses all values for a key into a single output entry.
 *
 * Each value is copied into the {@code duplicate} holder when its status
 * is STATUS_DB_DUPLICATE, otherwise into the {@code old} holder. If at
 * least one duplicate-status value was seen, {@code duplicate} is
 * emitted; otherwise {@code old} is emitted.
 */
public void reduce(Text key, Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
        throws IOException {
    boolean sawDuplicate = false;

    while (values.hasNext()) {
        final CrawlDatum current = values.next();
        final boolean markedDuplicate =
                current.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE;

        if (!markedDuplicate) {
            old.set(current);
            continue;
        }
        duplicate.set(current);
        sawDuplicate = true;
    }

    // a duplicate-status entry wins; otherwise fall back to the old entry
    output.collect(key, sawDuplicate ? duplicate : old);
}

Example 3

Source File: DomainStatistics.java From anthelion with Apache License 2.0

5 votes

/**
 * Counts URLs by host, domain, suffix or top-level domain (selected by
 * {@code mode}) and tracks how many entries were fetched or not.
 *
 * For entries whose status is STATUS_DB_FETCHED or STATUS_DB_NOTMODIFIED
 * the grouping key is derived from the URL and written with a count of 1,
 * then the FETCHED counter and key are updated. All other entries only
 * update the NOT_FETCHED counter and key.
 *
 * A URL whose grouping key cannot be derived (malformed URL, lookup
 * failure, unknown mode) is logged and skipped, but still counted as
 * fetched. Failures while writing to the context are propagated rather
 * than being silently discarded.
 *
 * @param urlText the URL of the entry
 * @param datum the crawl datum whose status decides how the entry is counted
 * @param context the context that receives the counts and counters
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(Text urlText, CrawlDatum datum, Context context) throws IOException, InterruptedException {

  if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {

    URL url = null;
    String out = null;
    try {
      url = new URL(urlText.toString());
      switch (mode) {
        case MODE_HOST:
          out = url.getHost();
          break;
        case MODE_DOMAIN:
          out = URLUtil.getDomainName(url);
          break;
        case MODE_SUFFIX:
          out = URLUtil.getDomainSuffix(url).getDomain();
          break;
        case MODE_TLD:
          out = URLUtil.getTopLevelDomainName(url);
          break;
        default:
          // unknown mode: no grouping key can be derived
          break;
      }
    } catch (Exception ex) {
      // Best effort: a URL we cannot analyse must not fail the job,
      // but the failure should be visible in the logs.
      LOG.warn("Could not derive grouping key for url : " + urlText, ex);
      out = null;
    }

    // Writing happens outside the try block so that IOException and
    // InterruptedException from the context are not swallowed.
    if (out != null) {
      if(out.trim().equals("")) {
        LOG.info("url : " + url);
        context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
      }

      context.write(new Text(out), new LongWritable(1));
    }

    context.getCounter(MyCounter.FETCHED).increment(1);
    context.write(FETCHED_TEXT, new LongWritable(1));
  }
  else {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
  }
}

Example 4

Source File: SolrClean.java From anthelion with Apache License 2.0

5 votes

/**
 * Emits the key of every entry whose status is STATUS_DB_GONE, using the
 * shared {@code OUT} value as the output key. Entries with any other
 * status produce no output.
 */
@Override
public void map(Text key, CrawlDatum value,
    OutputCollector<ByteWritable, Text> output, Reporter reporter)
    throws IOException {

  // Anything not marked as gone is ignored.
  if (value.getStatus() != CrawlDatum.STATUS_DB_GONE) {
    return;
  }

  output.collect(OUT, key);
}

Example 5

Source File: DomainStatistics.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Counts URLs by host, domain, suffix or top-level domain (selected by
 * {@code mode}) and tracks how many entries were fetched or not.
 *
 * For entries whose status is STATUS_DB_FETCHED or STATUS_DB_NOTMODIFIED
 * the grouping key is derived from the URL and written with a count of 1,
 * then the FETCHED counter and key are updated. All other entries only
 * update the NOT_FETCHED counter and key.
 *
 * A URL whose grouping key cannot be derived (malformed URL, lookup
 * failure, unknown mode) is logged and skipped, but still counted as
 * fetched. Failures while writing to the context are propagated rather
 * than being silently discarded.
 *
 * @param urlText the URL of the entry
 * @param datum the crawl datum whose status decides how the entry is counted
 * @param context the context that receives the counts and counters
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(Text urlText, CrawlDatum datum, Context context) throws IOException, InterruptedException {

  if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {

    URL url = null;
    String out = null;
    try {
      url = new URL(urlText.toString());
      switch (mode) {
        case MODE_HOST:
          out = url.getHost();
          break;
        case MODE_DOMAIN:
          out = URLUtil.getDomainName(url);
          break;
        case MODE_SUFFIX:
          out = URLUtil.getDomainSuffix(url).getDomain();
          break;
        case MODE_TLD:
          out = URLUtil.getTopLevelDomainName(url);
          break;
        default:
          // unknown mode: no grouping key can be derived
          break;
      }
    } catch (Exception ex) {
      // Best effort: a URL we cannot analyse must not fail the job,
      // but the failure should be visible in the logs.
      LOG.warn("Could not derive grouping key for url : " + urlText, ex);
      out = null;
    }

    // Writing happens outside the try block so that IOException and
    // InterruptedException from the context are not swallowed.
    if (out != null) {
      if(out.trim().equals("")) {
        LOG.info("url : " + url);
        context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
      }

      context.write(new Text(out), new LongWritable(1));
    }

    context.getCounter(MyCounter.FETCHED).increment(1);
    context.write(FETCHED_TEXT, new LongWritable(1));
  }
  else {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
  }
}

Example 6

Source File: CleaningJob.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Emits the key of every entry whose status is STATUS_DB_GONE or
 * STATUS_DB_DUPLICATE, using the shared {@code OUT} value as the output
 * key. Entries with any other status produce no output.
 */
@Override
public void map(Text key, CrawlDatum value,
        OutputCollector<ByteWritable, Text> output, Reporter reporter)
        throws IOException {

    // Neither gone nor duplicate: nothing to emit.
    if (value.getStatus() != CrawlDatum.STATUS_DB_GONE
            && value.getStatus() != CrawlDatum.STATUS_DB_DUPLICATE) {
        return;
    }

    output.collect(OUT, key);
}

Example 7

Source File: TestSegmentMergerCrawlDatums.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Checks the merged segment and removes the test directory again.
 *
 * Reads every &lt;Text,CrawlDatum&gt; pair from the merged segment's fetch
 * directory, logs the status of each entry, and remembers the status of
 * the last entry that has a fetch status and whose key is
 * http://nutch.apache.org/.
 *
 * @param testDir the test directory, deleted recursively once reading has finished
 * @param mergedSegment the merged segment whose fetch directory is read
 * @return the final status, or 0x0 if no matching fetch-status entry was found
 * @throws Exception if reading the segment or deleting the test directory fails
 */
protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception  {
  final String targetUrl = "http://nutch.apache.org/";

  // Get a MapFile reader for the <Text,CrawlDatum> pairs
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
  
  Text key = new Text();
  CrawlDatum value = new CrawlDatum();
  byte finalStatus = 0x0;
  
  for (MapFile.Reader reader : readers) {
    try {
      while (reader.next(key, value)) {
        LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus()));
        
        // Only consider fetch status
        if (CrawlDatum.hasFetchStatus(value) && key.toString().equals(targetUrl)) {
          finalStatus = value.getStatus();
        }
      }
    } finally {
      // Close the reader again, even when reading failed part-way
      reader.close();
    }
  }

  // Remove the test directory again
  fs.delete(testDir, true);
  
  LOG.info("Final fetch status for: " + targetUrl + " > " + CrawlDatum.getStatusName(finalStatus));

  // Return the final status
  return finalStatus;
}