Java Code Examples for org.apache.nutch.crawl.CrawlDatum#getFetchTime()

The following examples show how to use org.apache.nutch.crawl.CrawlDatum#getFetchTime(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds time-related fields to the document being indexed.
 * <p>
 * When the parse metadata carries a Last-Modified value, it is converted via
 * {@code getTime} and added as the "lastModified" field. The "date" field is
 * always added: it holds the converted Last-Modified value, or the datum's
 * fetch time when that value is {@code -1} (no header present).
 *
 * @param doc   document to which the fields are added
 * @param data  parse data providing the Last-Modified metadata
 * @param url   URL of the page, passed through to {@code getTime}
 * @param datum crawl datum supplying the fallback fetch time
 * @return the same {@code doc} instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModifiedHeader = data.getMeta(Metadata.LAST_MODIFIED);

  long timestamp;
  if (lastModifiedHeader == null) {
    timestamp = -1;                               // nothing to parse
  } else {
    // NOTE(review): getTime presumably returns -1 when the header cannot be
    // parsed; in that case "lastModified" is still added with that value.
    // Confirm against getTime before relying on this field.
    timestamp = getTime(lastModifiedHeader, url);
    doc.add("lastModified", new Date(timestamp));
  }

  // -1 means no usable Last-Modified value: fall back to the fetch time
  final long indexedTime = (timestamp == -1) ? datum.getFetchTime() : timestamp;
  doc.add("date", new Date(indexedTime));

  return doc;
}
 
Example 2
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Adds time-related fields to the document being indexed.
 * <p>
 * When the parse metadata carries a Last-Modified value, it is converted via
 * {@code getTime} and added as the "lastModified" field. The "date" field is
 * always added, using the first available of: the converted Last-Modified
 * value (unless it is {@code -1}), the datum's modified time (if positive),
 * or the datum's fetch time.
 *
 * @param doc   document to which the fields are added
 * @param data  parse data providing the Last-Modified metadata
 * @param url   URL of the page, passed through to {@code getTime}
 * @param datum crawl datum supplying the fallback modified/fetch times
 * @return the same {@code doc} instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModifiedHeader = data.getMeta(Metadata.LAST_MODIFIED);

  long timestamp;
  if (lastModifiedHeader == null) {
    timestamp = -1;                               // nothing to parse
  } else {
    // NOTE(review): getTime presumably returns -1 when the header cannot be
    // parsed; in that case "lastModified" is still added with that value.
    // Confirm against getTime before relying on this field.
    timestamp = getTime(lastModifiedHeader, url);
    doc.add("lastModified", new Date(timestamp));
  }

  if (timestamp == -1) {
    // no usable Last-Modified value: prefer the modified time recorded in
    // the CrawlDatum and, if that is unset (<= 0), use the fetch time
    final long modifiedTime = datum.getModifiedTime();
    timestamp = (modifiedTime > 0) ? modifiedTime : datum.getFetchTime();
  }

  doc.add("date", new Date(timestamp));
  return doc;
}
 
Example 3
Source File: MimeAdaptiveFetchSchedule.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a small simulation of {@code MimeAdaptiveFetchSchedule} against a
 * single synthetic page and logs the results.
 * <p>
 * A simulated clock starts at 0 and advances by {@code delta} on each of
 * 10000 iterations. The page is flagged as changed whenever
 * {@code lastModified + update < curTime}. Whenever the datum's fetch time
 * is due ({@code <= curTime}), {@code setFetchSchedule} is called and the
 * adjusted fetch time and interval are logged. Miss and fetch totals are
 * logged at the end.
 *
 * @param args command-line arguments (not read by this method)
 * @throws Exception declared on the signature; nothing is caught here
 */
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new MimeAdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day (24 hours), in the same units as update
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days (3600 * 24 * 30; presumably seconds,
  // given that the log output below divides it by SECONDS_PER_DAY)
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

  // Set a default MIME-type to test with
  org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
  x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  p.setMetaData(x);

  p.setFetchTime(0);
  LOG.info(p.toString());

  // advance the simulated clock by one delta per iteration
  for (int i = 0; i < 10000; i++) {
    // the page "changes" once more than `update` has passed since the last change
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }

    // fetch time is printed as a number of deltas, the interval in days
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);

    // the page is due: simulate a fetch and let the schedule adjust the datum
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);

      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");

      // a fetch that found the page unchanged also counts as a miss
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      // fold the current run of misses into the total and start a new run
      totalMiss += miss;
      miss = 0;
    }

    // the page has changed but has not been fetched yet in this iteration
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 4
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Decides whether a page is eligible for selection in the current
 * fetchlist. A {@code true} result only makes the page a candidate for the
 * later score-based selection; it does not guarantee that it is fetched.
 * <p>
 * This implementation returns {@code false} when the datum's
 * {@code fetchTime} is later than {@code curTime}, and {@code true}
 * otherwise. Before that check, a {@code fetchTime} lying more than
 * {@code maxInterval * 1000} beyond {@code curTime} is reset to
 * {@code curTime} (so the method then returns {@code true}), and a fetch
 * interval greater than {@code maxInterval} is lowered to
 * {@code maxInterval * 0.9f}.
 *
 * @param url URL of the page (not read by this implementation).
 *
 * @param datum datum instance; its fetch time and fetch interval may be
 * modified as described above.
 *
 * @param curTime reference time (usually set to the time when the
 * fetchlist generation process was started).
 *
 * @return true, if the page should be considered for inclusion in the current
 * fetchlist, otherwise false.
 */
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
  // Pages are never treated as permanently gone: a fetch time scheduled too
  // far ahead is pulled back so that the page is re-checked within the
  // maximum fetch interval (segment retention period).
  final long timeUntilFetch = datum.getFetchTime() - curTime;
  final long maxIntervalScaled = (long) maxInterval * 1000;
  if (timeUntilFetch > maxIntervalScaled) {
    final boolean intervalTooLong = datum.getFetchInterval() > maxInterval;
    if (intervalTooLong) {
      datum.setFetchInterval(maxInterval * 0.9f);
    }
    datum.setFetchTime(curTime);
  }

  // eligible only once the (possibly adjusted) fetch time has been reached
  return datum.getFetchTime() <= curTime;
}
 
Example 5
Source File: AdaptiveFetchSchedule.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a small simulation of {@code AdaptiveFetchSchedule} against a single
 * synthetic page and logs the results.
 * <p>
 * A simulated clock starts at 0 and advances by {@code delta} on each of
 * 10000 iterations. The page is flagged as changed whenever
 * {@code lastModified + update < curTime}. Whenever the datum's fetch time
 * is due ({@code <= curTime}), {@code setFetchSchedule} is called and the
 * adjusted fetch time and interval are logged. Miss and fetch totals are
 * logged at the end.
 *
 * @param args command-line arguments (not read by this method)
 * @throws Exception declared on the signature; nothing is caught here
 */
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new AdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day (24 hours), in the same units as update
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;
  // initial fetchInterval is 30 days (3600 * 24 * 30; presumably seconds,
  // given that the log output below divides it by SECONDS_PER_DAY)
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  p.setFetchTime(0);
  LOG.info(p.toString());
  // advance the simulated clock by one delta per iteration
  for (int i = 0; i < 10000; i++) {
    // the page "changes" once more than `update` has passed since the last change
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    // fetch time is printed as a number of deltas, the interval in days
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
    // the page is due: simulate a fetch and let the schedule adjust the datum
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
      // a fetch that found the page unchanged also counts as a miss
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      // fold the current run of misses into the total and start a new run
      totalMiss += miss;
      miss = 0;
    }
    // the page has changed but has not been fetched yet in this iteration
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 6
Source File: MimeAdaptiveFetchSchedule.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a small simulation of {@code MimeAdaptiveFetchSchedule} against a
 * single synthetic page and logs the results.
 * <p>
 * A simulated clock starts at 0 and advances by {@code delta} on each of
 * 10000 iterations. The page is flagged as changed whenever
 * {@code lastModified + update < curTime}. Whenever the datum's fetch time
 * is due ({@code <= curTime}), {@code setFetchSchedule} is called and the
 * adjusted fetch time and interval are logged. Miss and fetch totals are
 * logged at the end.
 *
 * @param args command-line arguments (not read by this method)
 * @throws Exception declared on the signature; nothing is caught here
 */
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new MimeAdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day (24 hours), in the same units as update
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days (3600 * 24 * 30; presumably seconds,
  // given that the log output below divides it by SECONDS_PER_DAY)
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

  // Set a default MIME-type to test with
  org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
  x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  p.setMetaData(x);

  p.setFetchTime(0);
  LOG.info(p.toString());

  // advance the simulated clock by one delta per iteration
  for (int i = 0; i < 10000; i++) {
    // the page "changes" once more than `update` has passed since the last change
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }

    // fetch time is printed as a number of deltas, the interval in days
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);

    // the page is due: simulate a fetch and let the schedule adjust the datum
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);

      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");

      // a fetch that found the page unchanged also counts as a miss
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      // fold the current run of misses into the total and start a new run
      totalMiss += miss;
      miss = 0;
    }

    // the page has changed but has not been fetched yet in this iteration
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 7
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Decides whether a page is eligible for selection in the current
 * fetchlist. A {@code true} result only makes the page a candidate for the
 * later score-based selection; it does not guarantee that it is fetched.
 * <p>
 * This implementation returns {@code false} when the datum's
 * {@code fetchTime} is later than {@code curTime}, and {@code true}
 * otherwise. Before that check, a {@code fetchTime} lying more than
 * {@code maxInterval * 1000} beyond {@code curTime} is reset to
 * {@code curTime} (so the method then returns {@code true}), and a fetch
 * interval greater than {@code maxInterval} is lowered to
 * {@code maxInterval * 0.9f}.
 *
 * @param url URL of the page (not read by this implementation).
 *
 * @param datum datum instance; its fetch time and fetch interval may be
 * modified as described above.
 *
 * @param curTime reference time (usually set to the time when the
 * fetchlist generation process was started).
 *
 * @return true, if the page should be considered for inclusion in the current
 * fetchlist, otherwise false.
 */
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
  // Pages are never treated as permanently gone: a fetch time scheduled too
  // far ahead is pulled back so that the page is re-checked within the
  // maximum fetch interval (segment retention period).
  final long timeUntilFetch = datum.getFetchTime() - curTime;
  final long maxIntervalScaled = (long) maxInterval * 1000;
  if (timeUntilFetch > maxIntervalScaled) {
    final boolean intervalTooLong = datum.getFetchInterval() > maxInterval;
    if (intervalTooLong) {
      datum.setFetchInterval(maxInterval * 0.9f);
    }
    datum.setFetchTime(curTime);
  }

  // eligible only once the (possibly adjusted) fetch time has been reached
  return datum.getFetchTime() <= curTime;
}
 
Example 8
Source File: AdaptiveFetchSchedule.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a small simulation of {@code AdaptiveFetchSchedule} against a single
 * synthetic page and logs the results.
 * <p>
 * A simulated clock starts at 0 and advances by {@code delta} on each of
 * 10000 iterations. The page is flagged as changed whenever
 * {@code lastModified + update < curTime}. Whenever the datum's fetch time
 * is due ({@code <= curTime}), {@code setFetchSchedule} is called and the
 * adjusted fetch time and interval are logged. Miss and fetch totals are
 * logged at the end.
 *
 * @param args command-line arguments (not read by this method)
 * @throws Exception declared on the signature; nothing is caught here
 */
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new AdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day (24 hours), in the same units as update
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;
  // initial fetchInterval is 30 days (3600 * 24 * 30; presumably seconds,
  // given that the log output below divides it by SECONDS_PER_DAY)
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  p.setFetchTime(0);
  LOG.info(p.toString());
  // advance the simulated clock by one delta per iteration
  for (int i = 0; i < 10000; i++) {
    // the page "changes" once more than `update` has passed since the last change
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    // fetch time is printed as a number of deltas, the interval in days
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
            + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
    // the page is due: simulate a fetch and let the schedule adjust the datum
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
              p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
              changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
      // a fetch that found the page unchanged also counts as a miss
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      // fold the current run of misses into the total and start a new run
      totalMiss += miss;
      miss = 0;
    }
    // the page has changed but has not been fetched yet in this iteration
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
 
Example 9
Source File: AbstractFetchSchedule.java    From anthelion with Apache License 2.0 2 votes vote down vote up
/**
 * Returns the last fetch time of the given CrawlDatum, computed as its
 * fetch time minus its fetch interval multiplied by 1000.
 * NOTE(review): this presumably converts an interval in seconds to the
 * millisecond scale of the fetch time - confirm against CrawlDatum.
 *
 * @param datum datum whose fetch time and fetch interval are read
 * @return the computed time as a long.
 */
public long calculateLastFetchTime(CrawlDatum datum) {
  final long scheduledFetchTime = datum.getFetchTime();
  // widen to long before scaling, so the multiplication is done in long
  final long intervalScaled = (long) datum.getFetchInterval() * 1000;
  return scheduledFetchTime - intervalScaled;
}
 
Example 10
Source File: AbstractFetchSchedule.java    From nutch-htmlunit with Apache License 2.0 2 votes vote down vote up
/**
 * Returns the last fetch time of the given CrawlDatum, computed as its
 * fetch time minus its fetch interval multiplied by 1000.
 * NOTE(review): this presumably converts an interval in seconds to the
 * millisecond scale of the fetch time - confirm against CrawlDatum.
 *
 * @param datum datum whose fetch time and fetch interval are read
 * @return the computed time as a long.
 */
public long calculateLastFetchTime(CrawlDatum datum) {
  final long scheduledFetchTime = datum.getFetchTime();
  // widen to long before scaling, so the multiplication is done in long
  final long intervalScaled = (long) datum.getFetchInterval() * 1000;
  return scheduledFetchTime - intervalScaled;
}