Java Code Examples for org.apache.nutch.crawl.CrawlDatum#setFetchTime()

The following examples show how to use org.apache.nutch.crawl.CrawlDatum#setFetchTime(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
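Before diving into the examples, here is a minimal, hypothetical sketch of the call itself: setFetchTime(long) stores the next scheduled fetch time as epoch milliseconds on a CrawlDatum, and getFetchTime() reads it back. The class name SetFetchTimeDemo is invented for illustration and does not come from the projects below.

import org.apache.nutch.crawl.CrawlDatum;

public class SetFetchTimeDemo {
  public static void main(String[] args) {
    // Schedule the next fetch for "now"; fetch times are epoch milliseconds.
    CrawlDatum datum = new CrawlDatum();
    datum.setFetchTime(System.currentTimeMillis());

    // Push the next fetch one day into the future.
    long oneDayMs = 24L * 3600L * 1000L;
    datum.setFetchTime(datum.getFetchTime() + oneDayMs);

    System.out.println("Next fetch at: " + new java.util.Date(datum.getFetchTime()));
  }
}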
Example 1
Source File: TestBasicIndexingFilter.java    From nutch-htmlunit with Apache License 2.0
public void testBasicIndexingFilter() throws Exception { 
  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  conf.setBoolean("indexer.add.domain", true);
  conf.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);

  NutchDocument doc = new NutchDocument();

  String title = "The Foo Page";
  Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata metaData = new Metadata();
  metaData.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
  ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

  CrawlDatum crawlDatum = new CrawlDatum();
  crawlDatum.setFetchTime(100L);

  Inlinks inlinks = new Inlinks();

  try {
    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
  } catch(Exception e){
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", 
    doc.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
}
 
Example 2
Source File: MimeAdaptiveFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new MimeAdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

  // Set a default MIME-type to test with
  org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
  x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  p.setMetaData(x);

  p.setFetchTime(0);
  LOG.info(p.toString());

  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }

    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);

    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);

      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");

      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }

    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 3
Source File: AdaptiveFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new AdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;
  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  p.setFetchTime(0);
  LOG.info(p.toString());
  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 4
Source File: AdaptiveFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
        long prevFetchTime, long prevModifiedTime,
        long fetchTime, long modifiedTime, int state) {
  super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
      fetchTime, modifiedTime, state);

  float interval = datum.getFetchInterval();
  long refTime = fetchTime;

  // https://issues.apache.org/jira/browse/NUTCH-1430
  interval = (interval == 0) ? defaultInterval : interval;

  if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
    // If a fixed fetch interval is preset in the CrawlDatum metadata, use it
    FloatWritable customIntervalWritable = (FloatWritable) datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY);
    interval = customIntervalWritable.get();
  } else {
    if (modifiedTime <= 0) modifiedTime = fetchTime;
    switch (state) {
      case FetchSchedule.STATUS_MODIFIED:
        interval *= (1.0f - DEC_RATE);
        break;
      case FetchSchedule.STATUS_NOTMODIFIED:
        interval *= (1.0f + INC_RATE);
        break;
      case FetchSchedule.STATUS_UNKNOWN:
        break;
    }
    if (SYNC_DELTA) {
      // try to synchronize with the time of change
      long delta = (fetchTime - modifiedTime) / 1000L;
      if (delta > interval) interval = delta;
      refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
    }
    if (interval < MIN_INTERVAL) {
      interval = MIN_INTERVAL;
    } else if (interval > MAX_INTERVAL) {
      interval = MAX_INTERVAL;
    }
  }

  datum.setFetchInterval(interval);
  datum.setFetchTime(refTime + Math.round(interval * 1000.0));
  datum.setModifiedTime(modifiedTime);
  return datum;
}
 
Example 5
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
/**
 * This method provides information whether the page is suitable for
 * selection in the current fetchlist. NOTE: a true return value does not
 * guarantee that the page will be fetched, it just allows it to be
 * included in the further selection process based on scores. The default
 * implementation checks <code>fetchTime</code>: if it is higher than
 * <code>curTime</code> it returns false, otherwise true. It also checks
 * that <code>fetchTime</code> is not too far in the future (more than
 * <code>maxInterval</code> ahead), in which case it lowers the interval and returns true.
 *
 * @param url URL of the page.
 *
 * @param datum datum instance.
 *
 * @param curTime reference time (usually set to the time when the
 * fetchlist generation process was started).
 *
 * @return true, if the page should be considered for inclusion in the current
 * fetchlist, otherwise false.
 */
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
  // pages are never truly GONE - we have to check them from time to time.
  // pages with too long fetchInterval are adjusted so that they fit within
  // maximum fetchInterval (segment retention period).
  if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
    if (datum.getFetchInterval() > maxInterval) {
      datum.setFetchInterval(maxInterval * 0.9f);
    }
    datum.setFetchTime(curTime);
  }
  if (datum.getFetchTime() > curTime) {
    return false;                                   // not time yet
  }
  return true;
}
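As a quick follow-up to the method above, a short hypothetical driver (assuming the DefaultFetchSchedule subclass shipped with Nutch and the NutchConfiguration setup used in the other examples) shows how the value passed to setFetchTime() decides the outcome of shouldFetch():

FetchSchedule fs = new DefaultFetchSchedule();
fs.setConf(NutchConfiguration.create());

CrawlDatum datum = new CrawlDatum();
long now = System.currentTimeMillis();

// Fetch time in the past: the page is due, so shouldFetch() returns true.
datum.setFetchTime(now - 1000L);
boolean due = fs.shouldFetch(new Text("http://www.example.com/"), datum, now);

// Fetch time one hour ahead: not due yet, so shouldFetch() returns false.
datum.setFetchTime(now + 3600L * 1000L);
boolean notYet = fs.shouldFetch(new Text("http://www.example.com/"), datum, now);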
 
Example 6
Source File: MimeAdaptiveFetchSchedule.java    From anthelion with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new MimeAdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

  // Set a default MIME-type to test with
  org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
  x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  p.setMetaData(x);

  p.setFetchTime(0);
  LOG.info(p.toString());

  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }

    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);

    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);

      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");

      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }

    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 7
Source File: AdaptiveFetchSchedule.java    From anthelion with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new AdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;
  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  p.setFetchTime(0);
  LOG.info(p.toString());
  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 8
Source File: AdaptiveFetchSchedule.java    From anthelion with Apache License 2.0
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
        long prevFetchTime, long prevModifiedTime,
        long fetchTime, long modifiedTime, int state) {
  super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
      fetchTime, modifiedTime, state);

  float interval = datum.getFetchInterval();
  long refTime = fetchTime;

  if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
    // If a fixed fetch interval is preset in the CrawlDatum metadata, use it
    FloatWritable customIntervalWritable = (FloatWritable) datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY);
    interval = customIntervalWritable.get();
  } else {
    if (modifiedTime <= 0) modifiedTime = fetchTime;
    switch (state) {
      case FetchSchedule.STATUS_MODIFIED:
        interval *= (1.0f - DEC_RATE);
        break;
      case FetchSchedule.STATUS_NOTMODIFIED:
        interval *= (1.0f + INC_RATE);
        break;
      case FetchSchedule.STATUS_UNKNOWN:
        break;
    }
    if (SYNC_DELTA) {
      // try to synchronize with the time of change
      long delta = (fetchTime - modifiedTime) / 1000L;
      if (delta > interval) interval = delta;
      refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
    }
    if (interval < MIN_INTERVAL) {
      interval = MIN_INTERVAL;
    } else if (interval > MAX_INTERVAL) {
      interval = MAX_INTERVAL;
    }
  }

  datum.setFetchInterval(interval);
  datum.setFetchTime(refTime + Math.round(interval * 1000.0));
  datum.setModifiedTime(modifiedTime);
  return datum;
}
 
Example 9
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0
/**
 * This method provides information whether the page is suitable for
 * selection in the current fetchlist. NOTE: a true return value does not
 * guarantee that the page will be fetched, it just allows it to be
 * included in the further selection process based on scores. The default
 * implementation checks <code>fetchTime</code>: if it is higher than
 * <code>curTime</code> it returns false, otherwise true. It also checks
 * that <code>fetchTime</code> is not too far in the future (more than
 * <code>maxInterval</code> ahead), in which case it lowers the interval and returns true.
 *
 * @param url URL of the page.
 *
 * @param datum datum instance.
 *
 * @param curTime reference time (usually set to the time when the
 * fetchlist generation process was started).
 *
 * @return true, if the page should be considered for inclusion in the current
 * fetchlist, otherwise false.
 */
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
  // pages are never truly GONE - we have to check them from time to time.
  // pages with too long fetchInterval are adjusted so that they fit within
  // maximum fetchInterval (segment retention period).
  if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
    if (datum.getFetchInterval() > maxInterval) {
      datum.setFetchInterval(maxInterval * 0.9f);
    }
    datum.setFetchTime(curTime);
  }
  if (datum.getFetchTime() > curTime) {
    return false;                                   // not time yet
  }
  return true;
}
 
Example 10
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0
/**
 * This method resets fetchTime, fetchInterval, modifiedTime,
 * retriesSinceFetch and page signature, so that it forces refetching.
 *
 * @param url URL of the page.
 *
 * @param datum datum instance.
 *
 * @param asap if true, force refetch as soon as possible - this sets
 * the fetchTime to now. If false, force refetch whenever the next fetch
 * time is set.
 */
public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
  // reduce fetchInterval so that it fits within the max value
  if (datum.getFetchInterval() > maxInterval)
    datum.setFetchInterval(maxInterval * 0.9f);
  datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
  datum.setRetriesSinceFetch(0);
  datum.setSignature(null);
  datum.setModifiedTime(0L);
  if (asap) datum.setFetchTime(System.currentTimeMillis());
  return datum;
}
 
Example 11
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
/**
 * Initialize fetch schedule related data. Implementations should at least
 * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
 * implementation sets the <code>fetchTime</code> to now, using the
 * default <code>fetchInterval</code>.
 * 
 * @param url URL of the page.
 *
 * @param datum datum instance to be initialized (modified in place).
 */
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
  datum.setFetchTime(System.currentTimeMillis());
  datum.setFetchInterval(defaultInterval);
  datum.setRetriesSinceFetch(0);
  return datum;
}
 
Example 12
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
/**
 * This method specifies how to schedule refetching of pages
 * marked as GONE. Default implementation increases fetchInterval by 50%
 * but the value may never exceed <code>maxInterval</code>.
 *
 * @param url URL of the page.
 *
 * @param datum datum instance to be adjusted.
 *
 * @return adjusted page information, including all original information.
 * NOTE: this may be a different instance than the <code>datum</code> passed in,
 * but implementations should make sure that it contains at least all
 * information from the original <code>datum</code>.
 */
public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
        long prevFetchTime, long prevModifiedTime, long fetchTime) {
  // no page is truly GONE ... just increase the interval by 50%
  // and try much later.
  if ((datum.getFetchInterval() * 1.5f) < maxInterval)
    datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
  else
    datum.setFetchInterval(maxInterval * 0.9f);
  datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
  return datum;
}
 
Example 13
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
/**
 * This method adjusts the fetch schedule if fetching needs to be
 * re-tried due to transient errors. The default implementation
 * sets the next fetch time 1 day in the future and increases
 * the retry counter.
 *
 * @param url URL of the page.
 *
 * @param datum page information.
 *
 * @param prevFetchTime previous fetch time.
 *
 * @param prevModifiedTime previous modified time.
 *
 * @param fetchTime current fetch time.
 *
 * @return adjusted page information, including all original information.
 * NOTE: this may be a different instance than the <code>datum</code> passed in,
 * but implementations should make sure that it contains at least all
 * information from the original <code>datum</code>.
 */
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
        long prevFetchTime, long prevModifiedTime, long fetchTime) {
  datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY*1000);
  datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
  return datum;
}
 
Example 14
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0
/**
 * This method resets fetchTime, fetchInterval, modifiedTime,
 * retriesSinceFetch and page signature, so that it forces refetching.
 *
 * @param url URL of the page.
 *
 * @param datum datum instance.
 *
 * @param asap if true, force refetch as soon as possible - this sets
 * the fetchTime to now. If false, force refetch whenever the next fetch
 * time is set.
 */
public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
  // reduce fetchInterval so that it fits within the max value
  if (datum.getFetchInterval() > maxInterval)
    datum.setFetchInterval(maxInterval * 0.9f);
  datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
  datum.setRetriesSinceFetch(0);
  datum.setSignature(null);
  datum.setModifiedTime(0L);
  if (asap) datum.setFetchTime(System.currentTimeMillis());
  return datum;
}
 
Example 15
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0
/**
 * This method adjusts the fetch schedule if fetching needs to be
 * re-tried due to transient errors. The default implementation
 * sets the next fetch time 1 day in the future and increases
 * the retry counter.
 *
 * @param url URL of the page.
 *
 * @param datum page information.
 *
 * @param prevFetchTime previous fetch time.
 *
 * @param prevModifiedTime previous modified time.
 *
 * @param fetchTime current fetch time.
 *
 * @return adjusted page information, including all original information.
 * NOTE: this may be a different instance than the <code>datum</code> passed in,
 * but implementations should make sure that it contains at least all
 * information from the original <code>datum</code>.
 */
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
        long prevFetchTime, long prevModifiedTime, long fetchTime) {
  datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY*1000);
  datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
  return datum;
}
 
Example 16
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0
/**
 * This method specifies how to schedule refetching of pages
 * marked as GONE. Default implementation increases fetchInterval by 50%,
 * and if it exceeds the <code>maxInterval</code> it calls
 * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
 *
 * @param url URL of the page.
 *
 * @param datum datum instance to be adjusted.
 *
 * @return adjusted page information, including all original information.
 * NOTE: this may be a different instance than the <code>datum</code> passed in,
 * but implementations should make sure that it contains at least all
 * information from the original <code>datum</code>.
 */
public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
        long prevFetchTime, long prevModifiedTime, long fetchTime) {
  // no page is truly GONE ... just increase the interval by 50%
  // and try much later.
  datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
  datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
  if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
  return datum;
}
 
Example 17
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0 3 votes vote down vote up
/**
 * Initialize fetch schedule related data. Implementations should at least
 * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
 * implementation sets the <code>fetchTime</code> to now, using the
 * default <code>fetchInterval</code>.
 * 
 * @param url URL of the page.
 *
 * @param datum datum instance to be initialized (modified in place).
 */
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
  datum.setFetchTime(System.currentTimeMillis());
  datum.setFetchInterval(defaultInterval);
  datum.setRetriesSinceFetch(0);
  return datum;
}