/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;

/**
 * Creates three databases, one for inlinks, one for outlinks, and a node
 * database that holds the number of in and outlinks to a url and the current
 * score for the url.
 *
 * The score is set by an analysis program such as LinkRank. The WebGraph is an
 * updateable database. Outlinks are stored by their fetch time or by the
 * current system time if no fetch time is available. Only the most recent
 * version of outlinks for a given url is stored. As more crawls are executed
 * and the WebGraph updated, newer Outlinks will replace older Outlinks. This
 * allows the WebGraph to adapt to changes in the link structure of the web.
 *
 * The Inlink database is created from the Outlink database and is regenerated
 * when the WebGraph is updated. The Node database is created from both the
 * Inlink and Outlink databases. Because the Node database is overwritten when
 * the WebGraph is updated and because the Node database holds current scores
 * for urls, it is recommended that a crawl cycle (one or more full crawls)
 * fully complete before the WebGraph is updated and some type of analysis,
 * such as LinkRank, is run to update scores in the Node database in a stable
 * fashion.
 */
public class WebGraph
  extends Configured
  implements Tool {

  public static final Logger LOG = LoggerFactory.getLogger(WebGraph.class);
  public static final String LOCK_NAME = ".locked";
  public static final String INLINK_DIR = "inlinks";
  public static final String OUTLINK_DIR = "outlinks/current";
  public static final String OLD_OUTLINK_DIR = "outlinks/old";
  public static final String NODE_DIR = "nodes";
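
  /*
   * For orientation, the on-disk layout produced by createWebGraph() below
   * looks roughly like this (directory names come from the constants above;
   * the parent path is whatever the webgraphdb argument points at):
   *
   *   <webgraphdb>/nodes             node database (NODE_DIR)
   *   <webgraphdb>/inlinks           inlink database (INLINK_DIR)
   *   <webgraphdb>/outlinks/current  current outlink database (OUTLINK_DIR)
   *   <webgraphdb>/outlinks/old      backup of the previous outlinks (OLD_OUTLINK_DIR)
   *   <webgraphdb>/.locked           lock file held while the graph is updated (LOCK_NAME)
   */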

  /**
   * The OutlinkDb creates a database of all outlinks. Outlinks to internal urls
   * by domain and host can be ignored. The number of Outlinks out to a given
   * page or domain can also be limited.
   */
  public static class OutlinkDb
    extends Configured
    implements Mapper<Text, Writable, Text, NutchWritable>,
    Reducer<Text, NutchWritable, Text, LinkDatum> {

    public static final String URL_NORMALIZING = "webgraph.url.normalizers";
    public static final String URL_FILTERING = "webgraph.url.filters";

    // ignoring internal domains, internal hosts
    private boolean ignoreDomain = true;
    private boolean ignoreHost = true;

    // limiting urls out to a page or to a domain
    private boolean limitPages = true;
    private boolean limitDomains = true;

    // using normalizers and/or filters
    private boolean normalize = false;
    private boolean filter = false;

    // url normalizers, filters and job configuration
    private URLNormalizers urlNormalizers;
    private URLFilters filters;
    private JobConf conf;

    /**
     * Normalizes and trims extra whitespace from the given url.
     *
     * @param url The url to normalize.
     *
     * @return The normalized url.
     */
    private String normalizeUrl(String url) {

      if (!normalize) {
        return url;
      }

      String normalized = null;
      if (urlNormalizers != null) {
        try {

          // normalize and trim the url
          normalized = urlNormalizers.normalize(url,
            URLNormalizers.SCOPE_DEFAULT);
          normalized = normalized.trim();
        }
        catch (Exception e) {
          LOG.warn("Skipping " + url + ":" + e);
          normalized = null;
        }
      }

      return normalized;
    }

    /**
     * Filters the given url.
     *
     * @param url The url to filter.
     *
     * @return The filtered url or null.
     */
    private String filterUrl(String url) {

      if (!filter) {
        return url;
      }

      try {
        url = filters.filter(url);
      } catch (Exception e) {
        url = null;
      }

      return url;
    }

    /**
     * Returns the fetch time from the parse data or the current system time if
     * the fetch time doesn't exist.
     *
     * @param data The parse data.
     *
     * @return The fetch time as a long.
     */
    private long getFetchTime(ParseData data) {

      // default to current system time
      long fetchTime = System.currentTimeMillis();
      String fetchTimeStr = data.getContentMeta().get(Nutch.FETCH_TIME_KEY);
      try {
        // get the fetch time from the parse data
        fetchTime = Long.parseLong(fetchTimeStr);
      }
      catch (Exception e) {
        fetchTime = System.currentTimeMillis();
      }

      return fetchTime;
    }

    /**
     * Default constructor.
     */
    public OutlinkDb() {
    }

    /**
     * Configurable constructor.
     */
    public OutlinkDb(Configuration conf) {
      super(conf);
    }

    /**
     * Configures the OutlinkDb job. Sets up the ignoring of internal links and
     * link limiting.
     */
    public void configure(JobConf conf) {
      this.conf = conf;
      ignoreHost = conf.getBoolean("link.ignore.internal.host", true);
      ignoreDomain = conf.getBoolean("link.ignore.internal.domain", true);
      limitPages = conf.getBoolean("link.ignore.limit.page", true);
      limitDomains = conf.getBoolean("link.ignore.limit.domain", true);

      normalize = conf.getBoolean(URL_NORMALIZING, false);
      filter = conf.getBoolean(URL_FILTERING, false);

      if (normalize) {
        urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
      }

      if (filter) {
        filters = new URLFilters(conf);
      }
    }
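
    /*
     * A minimal sketch of how the properties read in configure() above might
     * be overridden in nutch-site.xml (the values shown are illustrative only):
     *
     *   <property>
     *     <name>link.ignore.internal.host</name>
     *     <value>false</value>
     *   </property>
     *   <property>
     *     <name>link.ignore.limit.domain</name>
     *     <value>false</value>
     *   </property>
     */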

    /**
     * Passes through existing LinkDatum objects from an existing OutlinkDb and
     * maps out new LinkDatum objects from the ParseData of new crawls.
     */
    public void map(Text key, Writable value,
      OutputCollector<Text, NutchWritable> output, Reporter reporter)
      throws IOException {

      // normalize url, stop processing if null
      String url = normalizeUrl(key.toString());
      if (url == null) {
        return;
      }

      // filter url, stop processing if filtered out
      if (filterUrl(url) == null) {
        return;
      }

      // overwrite the key with the normalized URL
      key.set(url);

      if (value instanceof CrawlDatum) {
        CrawlDatum datum = (CrawlDatum)value;

        if (datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
            datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {

          // Tell the reducer to get rid of all instances of this key
          output.collect(key, new NutchWritable(new BooleanWritable(true)));
        }
      }
      else if (value instanceof ParseData) {

        // get the parse data and the outlinks from the parse data, along with
        // the fetch time for those links
        ParseData data = (ParseData)value;
        long fetchTime = getFetchTime(data);
        Outlink[] outlinkAr = data.getOutlinks();
        Map<String, String> outlinkMap = new LinkedHashMap<String, String>();

        // normalize and filter urls and put into the map
        if (outlinkAr != null && outlinkAr.length > 0) {
          for (int i = 0; i < outlinkAr.length; i++) {
            Outlink outlink = outlinkAr[i];
            String toUrl = normalizeUrl(outlink.getToUrl());

            if (filterUrl(toUrl) == null) {
              continue;
            }

            // only put into the map if the url doesn't already exist in the
            // map, or if it does and the anchor for that link is null, in which
            // case the existing entry is replaced
            boolean existingUrl = outlinkMap.containsKey(toUrl);
            if (toUrl != null
              && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
              outlinkMap.put(toUrl, outlink.getAnchor());
            }
          }
        }

        // collect the outlinks under the fetch time
        for (String outlinkUrl : outlinkMap.keySet()) {
          String anchor = outlinkMap.get(outlinkUrl);
          LinkDatum datum = new LinkDatum(outlinkUrl, anchor, fetchTime);
          output.collect(key, new NutchWritable(datum));
        }
      }
      else if (value instanceof LinkDatum) {
        LinkDatum datum = (LinkDatum)value;
        String linkDatumUrl = normalizeUrl(datum.getUrl());

        if (filterUrl(linkDatumUrl) != null) {

          // collect existing outlinks from the existing OutlinkDb
          output.collect(key, new NutchWritable(datum));
        }
      }
    }

    public void reduce(Text key, Iterator<NutchWritable> values,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      // aggregate all outlinks, get the most recent timestamp for a fetch
      // which should be the timestamp for all of the most recent outlinks
      long mostRecent = 0L;
      List<LinkDatum> outlinkList = new ArrayList<LinkDatum>();
      while (values.hasNext()) {
        Writable value = values.next().get();

        if (value instanceof LinkDatum) {
          // loop through, changing out the most recent timestamp if needed
          LinkDatum next = (LinkDatum)value;
          long timestamp = next.getTimestamp();
          if (mostRecent == 0L || mostRecent < timestamp) {
            mostRecent = timestamp;
          }
          outlinkList.add(WritableUtils.clone(next, conf));
          reporter.incrCounter("WebGraph.outlinks", "added links", 1);
        }
        else if (value instanceof BooleanWritable) {
          BooleanWritable delete = (BooleanWritable)value;
          // delete is always true, otherwise it isn't emitted by the mapper in the first place
          if (delete.get()) {
            // this page is gone, do not emit its outlinks
            reporter.incrCounter("WebGraph.outlinks", "removed links", 1);
            return;
          }
        }
      }

      // get the url, domain, and host for the url
      String url = key.toString();
      String domain = URLUtil.getDomainName(url);
      String host = URLUtil.getHost(url);

      // set up checking sets for domains and pages
      Set<String> domains = new HashSet<String>();
      Set<String> pages = new HashSet<String>();

      // loop through the link datums
      for (LinkDatum datum : outlinkList) {

        // get the url, host, domain, and page for each outlink
        String toUrl = datum.getUrl();
        String toDomain = URLUtil.getDomainName(toUrl);
        String toHost = URLUtil.getHost(toUrl);
        String toPage = URLUtil.getPage(toUrl);

        // an outlink must be the most recent and must conform to the internal
        // url and limiting rules; if it does, collect it
        if (datum.getTimestamp() == mostRecent
          && (!limitPages || (limitPages && !pages.contains(toPage)))
          && (!limitDomains || (limitDomains && !domains.contains(toDomain)))
          && (!ignoreHost || (ignoreHost && !toHost.equalsIgnoreCase(host)))
          && (!ignoreDomain || (ignoreDomain && !toDomain.equalsIgnoreCase(domain)))) {
          output.collect(key, datum);
          pages.add(toPage);
          domains.add(toDomain);
        }
      }
    }

    public void close() {
    }
  }

  /**
   * The InlinkDb creates a database of Inlinks. Inlinks are inverted from the
   * OutlinkDb LinkDatum objects and are regenerated each time the WebGraph is
   * updated.
   */
  private static class InlinkDb
    extends Configured
    implements Mapper<Text, LinkDatum, Text, LinkDatum> {

    private long timestamp;

    /**
     * Configures the job. Sets the timestamp for all Inlink LinkDatum objects
     * to the current system time.
     */
    public void configure(JobConf conf) {
      timestamp = System.currentTimeMillis();
    }

    public void close() {
    }

    /**
     * Inverts the Outlink LinkDatum objects into new LinkDatum objects with a
     * new system timestamp, type and to and from url switched.
     */
    public void map(Text key, LinkDatum datum,
      OutputCollector<Text, LinkDatum> output, Reporter reporter)
      throws IOException {

      // get the to and from url and the anchor
      String fromUrl = key.toString();
      String toUrl = datum.getUrl();
      String anchor = datum.getAnchor();

      // flip the from and to url and set the new link type
      LinkDatum inlink = new LinkDatum(fromUrl, anchor, timestamp);
      inlink.setLinkType(LinkDatum.INLINK);
      output.collect(new Text(toUrl), inlink);
    }
  }

  /**
   * Creates the Node database which consists of the number of in and outlinks
   * for each url and a score slot for analysis programs such as LinkRank.
   */
  private static class NodeDb
    extends Configured
    implements Reducer<Text, LinkDatum, Text, Node> {

    /**
     * Configures job.
     */
    public void configure(JobConf conf) { }

    public void close() { }

    /**
     * Counts the number of inlinks and outlinks for each url and sets a default
     * score of 0.0 for each url (node) in the webgraph.
     */
    public void reduce(Text key, Iterator<LinkDatum> values,
      OutputCollector<Text, Node> output, Reporter reporter)
      throws IOException {

      Node node = new Node();
      int numInlinks = 0;
      int numOutlinks = 0;

      // loop through counting number of in and out links
      while (values.hasNext()) {
        LinkDatum next = values.next();
        if (next.getLinkType() == LinkDatum.INLINK) {
          numInlinks++;
        }
        else if (next.getLinkType() == LinkDatum.OUTLINK) {
          numOutlinks++;
        }
      }

      // set the in and outlinks and a default score of 0
      node.setNumInlinks(numInlinks);
      node.setNumOutlinks(numOutlinks);
      output.collect(key, node);
    }
  }

  /**
   * Creates the three different WebGraph databases, Outlinks, Inlinks, and
   * Node. If a current WebGraph exists then it is updated, if it doesn't exist
   * then a new WebGraph database is created.
   *
   * @param webGraphDb The WebGraph to create or update.
   * @param segments The array of segments used to update the WebGraph. Newer
   * segments and fetch times will overwrite older segments.
   * @param normalize whether to use URLNormalizers on URLs in the segment
   * @param filter whether to use URLFilters on URLs in the segment
   *
   * @throws IOException If an error occurs while processing the WebGraph.
   */
  public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter)
    throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("WebGraphDb: starting at " + sdf.format(start));
      LOG.info("WebGraphDb: webgraphdb: " + webGraphDb);
      LOG.info("WebGraphDb: URL normalize: " + normalize);
      LOG.info("WebGraphDb: URL filter: " + filter);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // lock an existing webgraphdb to prevent multiple simultaneous updates
    Path lock = new Path(webGraphDb, LOCK_NAME);
    if (!fs.exists(webGraphDb)) {
      fs.mkdirs(webGraphDb);
    }

    LockUtil.createLockFile(fs, lock, false);

    // outlink and temp outlink database paths
    Path outlinkDb = new Path(webGraphDb, OUTLINK_DIR);
    Path oldOutlinkDb = new Path(webGraphDb, OLD_OUTLINK_DIR);

    if (!fs.exists(outlinkDb)) {
      fs.mkdirs(outlinkDb);
    }

    Path tempOutlinkDb = new Path(outlinkDb + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    JobConf outlinkJob = new NutchJob(conf);
    outlinkJob.setJobName("Outlinkdb: " + outlinkDb);

    boolean deleteGone = conf.getBoolean("link.delete.gone", false);
    boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);

    if (deleteGone) {
      LOG.info("OutlinkDb: deleting gone links");
    }

    // get the parse data and crawl fetch data for all segments
    if (segments != null) {
      for (int i = 0; i < segments.length; i++) {
        Path parseData = new Path(segments[i], ParseData.DIR_NAME);
        if (fs.exists(parseData)) {
          LOG.info("OutlinkDb: adding input: " + parseData);
          FileInputFormat.addInputPath(outlinkJob, parseData);
        }

        if (deleteGone) {
          Path crawlFetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
          if (fs.exists(crawlFetch)) {
            LOG.info("OutlinkDb: adding input: " + crawlFetch);
            FileInputFormat.addInputPath(outlinkJob, crawlFetch);
          }
        }
      }
    }

    // add the existing webgraph
    LOG.info("OutlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(outlinkJob, outlinkDb);

    outlinkJob.setBoolean(OutlinkDb.URL_NORMALIZING, normalize);
    outlinkJob.setBoolean(OutlinkDb.URL_FILTERING, filter);

    // wire up the OutlinkDb mapper/reducer, sequence file input, and map file output
    outlinkJob.setInputFormat(SequenceFileInputFormat.class);
    outlinkJob.setMapperClass(OutlinkDb.class);
    outlinkJob.setReducerClass(OutlinkDb.class);
    outlinkJob.setMapOutputKeyClass(Text.class);
    outlinkJob.setMapOutputValueClass(NutchWritable.class);
    outlinkJob.setOutputKeyClass(Text.class);
    outlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(outlinkJob, tempOutlinkDb);
    outlinkJob.setOutputFormat(MapFileOutputFormat.class);
    outlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the outlinkdb job and replace any old outlinkdb with the new one
    try {
      LOG.info("OutlinkDb: running");
      JobClient.runJob(outlinkJob);
      LOG.info("OutlinkDb: installing " + outlinkDb);
      FSUtils.replace(fs, oldOutlinkDb, outlinkDb, true);
      FSUtils.replace(fs, outlinkDb, tempOutlinkDb, true);
      if (!preserveBackup && fs.exists(oldOutlinkDb)) fs.delete(oldOutlinkDb, true);
      LOG.info("OutlinkDb: finished");
    }
    catch (IOException e) {
      // remove the lock file and the temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempOutlinkDb)) {
        fs.delete(tempOutlinkDb, true);
      }
      throw e;
    }

    // inlink and temp link database paths
    Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
    Path tempInlinkDb = new Path(inlinkDb + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf inlinkJob = new NutchJob(conf);
    inlinkJob.setJobName("Inlinkdb " + inlinkDb);
    LOG.info("InlinkDb: adding input: " + outlinkDb);
    FileInputFormat.addInputPath(inlinkJob, outlinkDb);

    // wire up the InlinkDb mapper, sequence file input, and map file output
    inlinkJob.setInputFormat(SequenceFileInputFormat.class);
    inlinkJob.setMapperClass(InlinkDb.class);
    inlinkJob.setMapOutputKeyClass(Text.class);
    inlinkJob.setMapOutputValueClass(LinkDatum.class);
    inlinkJob.setOutputKeyClass(Text.class);
    inlinkJob.setOutputValueClass(LinkDatum.class);
    FileOutputFormat.setOutputPath(inlinkJob, tempInlinkDb);
    inlinkJob.setOutputFormat(MapFileOutputFormat.class);
    inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    try {
      // run the inlink job and replace any old inlinkdb with the new one
      LOG.info("InlinkDb: running");
      JobClient.runJob(inlinkJob);
      LOG.info("InlinkDb: installing " + inlinkDb);
      FSUtils.replace(fs, inlinkDb, tempInlinkDb, true);
      LOG.info("InlinkDb: finished");
    }
    catch (IOException e) {
      // remove the lock file and the temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempInlinkDb)) {
        fs.delete(tempInlinkDb, true);
      }
      throw e;
    }

    // node and temp node database paths
    Path nodeDb = new Path(webGraphDb, NODE_DIR);
    Path tempNodeDb = new Path(nodeDb + "-"
      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf nodeJob = new NutchJob(conf);
    nodeJob.setJobName("NodeDb " + nodeDb);
    LOG.info("NodeDb: adding input: " + outlinkDb);
    LOG.info("NodeDb: adding input: " + inlinkDb);
    FileInputFormat.addInputPath(nodeJob, outlinkDb);
    FileInputFormat.addInputPath(nodeJob, inlinkDb);

    // wire up the NodeDb reducer, sequence file input, and map file output
    nodeJob.setInputFormat(SequenceFileInputFormat.class);
    nodeJob.setReducerClass(NodeDb.class);
    nodeJob.setMapOutputKeyClass(Text.class);
    nodeJob.setMapOutputValueClass(LinkDatum.class);
    nodeJob.setOutputKeyClass(Text.class);
    nodeJob.setOutputValueClass(Node.class);
    FileOutputFormat.setOutputPath(nodeJob, tempNodeDb);
    nodeJob.setOutputFormat(MapFileOutputFormat.class);
    nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    try {
      // run the node job and replace any old nodedb with the new one
      LOG.info("NodeDb: running");
      JobClient.runJob(nodeJob);
      LOG.info("NodeDb: installing " + nodeDb);
      FSUtils.replace(fs, nodeDb, tempNodeDb, true);
      LOG.info("NodeDb: finished");
    }
    catch (IOException e) {
      // remove the lock file and the temporary directory if an error occurs
      LockUtil.removeLockFile(fs, lock);
      if (fs.exists(tempNodeDb)) {
        fs.delete(tempNodeDb, true);
      }
      throw e;
    }

    // remove the lock file for the webgraph
    LockUtil.removeLockFile(fs, lock);

    long end = System.currentTimeMillis();
    LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

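  /**
   * Runs the WebGraph tool from the command line. As a rough, illustrative
   * example (the paths below are hypothetical), it can be invoked through the
   * Nutch launcher script with the options parsed in {@link #run(String[])}:
   *
   * <pre>
   * bin/nutch org.apache.nutch.scoring.webgraph.WebGraph \
   *   -webgraphdb crawl/webgraphdb \
   *   -segmentDir crawl/segments \
   *   -normalize -filter
   * </pre>
   */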
  public static void main(String[] args)
    throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new WebGraph(), args);
    System.exit(res);
  }

  /**
   * Parses command line arguments and runs the WebGraph jobs.
   */
  public int run(String[] args)
    throws Exception {

    Options options = new Options();
    OptionBuilder.withDescription("show this help message");
    Option helpOpts = OptionBuilder.create("help");
    options.addOption(helpOpts);
    OptionBuilder.hasArg();
    OptionBuilder.withDescription("the web graph database to use");
    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
    options.addOption(webGraphDbOpts);

    OptionBuilder.hasArgs();
    OptionBuilder.withDescription("the segment(s) to use");
    Option segOpts = OptionBuilder.create("segment");
    options.addOption(segOpts);

    OptionBuilder.hasArg();
    OptionBuilder.withDescription("the segment directory to use");
    Option segDirOpts = OptionBuilder.create("segmentDir");
    options.addOption(segDirOpts);

    OptionBuilder.withDescription("whether to use URLNormalizers on the URL's in the segment");
    Option normalizeOpts = OptionBuilder.create("normalize");
    options.addOption(normalizeOpts);

    OptionBuilder.withDescription("whether to use URLFilters on the URL's in the segment");
    Option filterOpts = OptionBuilder.create("filter");
    options.addOption(filterOpts);

    CommandLineParser parser = new GnuParser();
    try {

      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("webgraphdb")
        || (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("WebGraph", options);
        return -1;
      }

      String webGraphDb = line.getOptionValue("webgraphdb");

      Path[] segPaths = null;

      // Handle segment option
      if (line.hasOption("segment")) {
        String[] segments = line.getOptionValues("segment");
        segPaths = new Path[segments.length];
        for (int i = 0; i < segments.length; i++) {
          segPaths[i] = new Path(segments[i]);
        }
      }

      // Handle segmentDir option
      if (line.hasOption("segmentDir")) {
        Path dir = new Path(line.getOptionValue("segmentDir"));
        FileSystem fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        segPaths = HadoopFSUtil.getPaths(fstats);
      }

      boolean normalize = false;

      if (line.hasOption("normalize")) {
        normalize = true;
      }

      boolean filter = false;

      if (line.hasOption("filter")) {
        filter = true;
      }

      createWebGraph(new Path(webGraphDb), segPaths, normalize, filter);
      return 0;
    }
    catch (Exception e) {
      LOG.error("WebGraph: " + StringUtils.stringifyException(e));
      return -2;
    }
  }
}