Java Code Examples for org.apache.nutch.util.NutchConfiguration#create()

The following examples show how to use org.apache.nutch.util.NutchConfiguration#create(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: TestCCParseFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the given file as {@code text/html} and asserts the license
 * metadata found in the resulting parse.
 *
 * @param file     file whose bytes are fed to the parser
 * @param url      passed twice to the {@code Content} constructor and used to
 *                 look up the parse result
 * @param license  expected value of the {@code License-Url} parse metadata
 * @param location expected value of the {@code License-Location} parse metadata
 * @param type     expected value of the {@code Work-Type} parse metadata
 * @throws Exception if the file cannot be read or parsing fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  final byte[] bytes;
  InputStream in = new FileInputStream(file);
  try {
    ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
    bytes = out.toByteArray();
  } finally {
    // Close on every path so a failed read does not leak the file handle.
    in.close();
  }
  Configuration conf = NutchConfiguration.create();

  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
 
Example 2
Source File: WdcParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the file named by {@code args[0]}, parses it as {@code text/html}
 * with a {@code WdcParser} and prints the parse data, the parse text and the
 * {@code META_CONTAINS_SEM} metadata value to standard output.
 *
 * @param args {@code args[0]} is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
	// LOG.setLevel(Level.FINE);
	String name = args[0];
	String url = "file:" + name;
	File file = new File(name);
	byte[] bytes = new byte[(int) file.length()];
	DataInputStream in = new DataInputStream(new FileInputStream(file));
	try {
		in.readFully(bytes);
	} finally {
		// The stream was previously never closed.
		in.close();
	}
	Configuration conf = NutchConfiguration.create();
	WdcParser parser = new WdcParser();
	parser.setConf(conf);
	Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
	System.out.println("data: " + parse.getData());

	System.out.println("text: " + parse.getText());

	String contains = parse.getData().getMeta(META_CONTAINS_SEM);
	System.out.println("contains: " + contains);

}
 
Example 3
Source File: NodeReader.java    From nutchpy with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the records stored in the sequence file at {@code path}.
 *
 * The path is also printed to standard output, as before.
 *
 * @param path location of the sequence file
 * @return number of key/value records read from the file
 * @throws IOException if the file cannot be opened or read
 */
public static long count(String path) throws IOException  {
    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(path);
    System.out.println(file);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
        // Reusable holders of the types the file declares for its keys/values.
        Writable key = (Writable)
                ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable value = (Writable)
                ReflectionUtils.newInstance(reader.getValueClass(), conf);

        long i = 0;
        while (reader.next(key, value)) {
            i += 1;
        }
        return i;
    } finally {
        // The reader was previously left open.
        reader.close();
    }
}
 
Example 4
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the filter with {@code moreIndexingFilter.indexMimeTypeParts} set to
 * {@code false} and expects exactly one {@code type} value,
 * {@code text/html}, on the document.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(configuration);
  assertNotNull(indexingFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    indexingFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
 
Example 5
Source File: LoopReader.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the LoopReader tool.  For this tool to work the loops job must have
 * already been run on the corresponding WebGraph.
 *
 * Prints the help text unless both {@code -webgraphdb} and {@code -url} are
 * given and {@code -help} is absent; otherwise dumps the given url.
 */
public static void main(String[] args)
  throws Exception {

  // Build each option through OptionBuilder's static calls.
  OptionBuilder.withArgName("help");
  OptionBuilder.withDescription("show this help message");
  Option helpOpts = OptionBuilder.create("help");

  OptionBuilder.withArgName("webgraphdb");
  OptionBuilder.hasArg();
  OptionBuilder.withDescription("the webgraphdb to use");
  Option webGraphOpts = OptionBuilder.create("webgraphdb");

  OptionBuilder.withArgName("url");
  OptionBuilder.hasOptionalArg();
  OptionBuilder.withDescription("the url to dump");
  Option urlOpts = OptionBuilder.create("url");

  Options options = new Options();
  options.addOption(helpOpts);
  options.addOption(webGraphOpts);
  options.addOption(urlOpts);

  CommandLineParser parser = new GnuParser();
  try {
    CommandLine line = parser.parse(options, args);

    boolean canDump = !line.hasOption("help")
      && line.hasOption("webgraphdb")
      && line.hasOption("url");
    if (!canDump) {
      new HelpFormatter().printHelp("WebGraphReader", options);
      return;
    }

    Path webGraphDb = new Path(line.getOptionValue("webgraphdb"));
    String url = line.getOptionValue("url");
    LoopReader reader = new LoopReader(NutchConfiguration.create());
    reader.dumpUrl(webGraphDb, url);
  }
  catch (Exception e) {
    // Failures are only reported on stderr; the tool still returns normally.
    e.printStackTrace();
  }
}
 
Example 6
Source File: TestCrawlDbMerger.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Populates the url lists, builds the three {@code CrawlDatum} fixtures and
 * the expected url-to-datum map, then creates a randomly named test
 * directory on the configured file system.
 */
public void setUp() throws Exception {
  init1.add(url10);
  init1.add(url11);
  init2.add(url20);
  init2.add(url21);

  final long now = System.currentTimeMillis();
  final long later = now + 10000;

  cd1 = new CrawlDatum();
  cd1.setFetchInterval(1.0f);
  cd1.setFetchTime(now);
  cd1.getMetaData().put(new Text("name"), new Text("cd1"));
  cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));

  cd2 = new CrawlDatum();
  cd2.setFetchInterval(1.0f);
  cd2.setFetchTime(later);
  cd2.getMetaData().put(new Text("name"), new Text("cd2"));

  // cd3 receives cd1's metadata first and cd2's second.
  cd3 = new CrawlDatum();
  cd3.setFetchInterval(1.0f);
  cd3.setFetchTime(later);
  cd3.getMetaData().putAll(cd1.getMetaData());
  cd3.getMetaData().putAll(cd2.getMetaData());

  expected.put(url10, cd3);
  expected.put(url11, cd1);
  expected.put(url21, cd2);

  conf = NutchConfiguration.create();
  fs = FileSystem.get(conf);
  int randomSuffix = new java.util.Random().nextInt();
  testDir = new Path("test-crawldb-" + randomSuffix);
  fs.mkdirs(testDir);
}
 
Example 7
Source File: NodeReader.java    From nutchpy with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record of the sequence file at {@code path} and converts each
 * one to a row map via {@code getNodeRow}.
 *
 * Records whose conversion throws are skipped silently (best-effort read).
 *
 * @param path location of the sequence file; values are read as {@code Node}
 * @return the converted rows, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List read(String path) throws IOException {
    List<HashMap> rows=new ArrayList<HashMap>();

    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(path);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
        Writable key = (Writable)
                ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Node value = new Node();

        while(reader.next(key, value)) {
            try {
                HashMap<String, String> t_row = getNodeRow(key,value);
                rows.add(t_row);
            }
            catch (Exception ignored) {
                // Deliberately best-effort: a row that cannot be converted
                // is dropped and reading continues with the next record.
            }
        }
    } finally {
        // The reader was previously left open.
        reader.close();
    }

    return rows;
}
 
Example 8
Source File: TestOOParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches every sample file whose name starts with {@code ootest}, parses it
 * with the {@code parse-tika} extension and checks that some text came out.
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();
  ProtocolFactory factory = new ProtocolFactory(conf);

  System.out.println("Expected : "+expectedText);

  for (String sampleFile : sampleFiles) {
    if (!sampleFile.startsWith("ootest")) {
      continue;
    }

    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();
    Parse parse = new ParseUtil(conf)
        .parseByExtensionId("parse-tika", content)
        .get(content.getUrl());

    // Collapse runs of whitespace so only the textual content remains.
    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // Only the presence of text is checked - the ordering of the elements
    // may differ from what was expected in the previous tests.
    assertTrue(text != null && text.length() > 0);

    System.out.println("Found "+sampleFile+": "+text);
  }
}
 
Example 9
Source File: TestIndexingFilters.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Test behaviour when a configured filter does not exist: the filter order
 * names one unknown class followed by the basic indexing filter, and a
 * minimal document is then pushed through the filters.
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";

  Configuration conf = NutchConfiguration.create();
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);

  ParseData parseData = new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl parse = new ParseImpl("text", parseData);
  Text url = new Text("http://www.example.com/");

  filters.filter(new NutchDocument(), parse, url, new CrawlDatum(), new Inlinks());
}
 
Example 10
Source File: ParseData.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Prints one {@code ParseData} record of a segment.
 *
 * After generic option parsing, the first remaining argument is the record
 * number and the second is the segment path. The record is read from the
 * {@code DIR_NAME} file under that segment.
 *
 * @param argv command line; fewer than three arguments prints the usage
 * @throws Exception if option parsing or reading the record fails
 */
public static void main(String argv[]) throws Exception {
  String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";

  if (argv.length < 3) {
    System.out.println("usage:" + usage);
    return;
  }

  Options opts = new Options();
  Configuration conf = NutchConfiguration.create();

  GenericOptionsParser parser =
    new GenericOptionsParser(conf, opts, argv);

  String[] remainingArgs = parser.getRemainingArgs();
  FileSystem fs = FileSystem.get(conf);

  try {
    int recno = Integer.parseInt(remainingArgs[0]);
    String segment = remainingArgs[1];

    Path file = new Path(segment, DIR_NAME);
    System.out.println("Reading from file: " + file);

    ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf);
    try {
      ParseData parseDatum = new ParseData();
      parses.get(recno, parseDatum);

      System.out.println("Retrieved " + recno + " from file " + file);
      System.out.println(parseDatum);
    } finally {
      // Close the reader even when the lookup throws.
      parses.close();
    }
  } finally {
    fs.close();
  }
}
 
Example 11
Source File: TestParserFactory.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Initialises the test case: enables every plugin and points the parser
 * factory at the test parse-plugin file.
 */
protected void setUp() throws Exception {
    final String allPlugins = ".*";
    final String testPluginFile =
        "org/apache/nutch/parse/parse-plugin-test.xml";

    conf = NutchConfiguration.create();
    conf.set("plugin.includes", allPlugins);
    conf.set("parse.plugin.file", testPluginFile);

    parserFactory = new ParserFactory(conf);
}
 
Example 12
Source File: ParseText.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Prints one {@code ParseText} record of a segment.
 *
 * After generic option parsing, the first remaining argument is the record
 * number and the second is the segment path. The record is read from the
 * {@code ParseText.DIR_NAME} file under that segment.
 *
 * @param argv command line; fewer than three arguments prints the usage
 * @throws Exception if option parsing or reading the record fails
 */
public static void main(String argv[]) throws Exception {
  String usage = "ParseText (-local | -dfs <namenode:port>) recno segment";

  if (argv.length < 3) {
    System.out.println("usage:" + usage);
    return;
  }
  Options opts = new Options();
  Configuration conf = NutchConfiguration.create();

  GenericOptionsParser parser =
    new GenericOptionsParser(conf, opts, argv);

  String[] remainingArgs = parser.getRemainingArgs();

  FileSystem fs = FileSystem.get(conf);
  try {
    int recno = Integer.parseInt(remainingArgs[0]);
    String segment = remainingArgs[1];
    String filename = new Path(segment, ParseText.DIR_NAME).toString();

    ParseText parseText = new ParseText();
    ArrayFile.Reader parseTexts = new ArrayFile.Reader(fs, filename, conf);
    try {
      parseTexts.get(recno, parseText);
      System.out.println("Retrieved " + recno + " from file " + filename);
      System.out.println(parseText);
    } finally {
      // Close the reader even when the lookup throws.
      parseTexts.close();
    }
  } finally {
    fs.close();
  }
}
 
Example 13
Source File: TestSignatureFactory.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Requests a signature twice for the same configuration and expects two
 * non-null results that are equal to each other.
 */
public void testGetSignature() {
  Configuration configuration = NutchConfiguration.create();

  Signature first = SignatureFactory.getSignature(configuration);
  Signature second = SignatureFactory.getSignature(configuration);

  assertNotNull(first);
  assertNotNull(second);
  assertEquals(first, second);
}
 
Example 14
Source File: NodeReader.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the NodeReader tool.  The command line arguments must contain a
 * webgraphdb path and a url.  The url must match the normalized url that is
 * contained in the NodeDb of the WebGraph.
 */
public static void main(String[] args)
  throws Exception {

  // Describe the three supported options first, then register them together.
  OptionBuilder.withArgName("help");
  OptionBuilder.withDescription("show this help message");
  Option helpOpts = OptionBuilder.create("help");

  OptionBuilder.withArgName("webgraphdb");
  OptionBuilder.hasArg();
  OptionBuilder.withDescription("the webgraphdb to use");
  Option webGraphOpts = OptionBuilder.create("webgraphdb");

  OptionBuilder.withArgName("url");
  OptionBuilder.hasOptionalArg();
  OptionBuilder.withDescription("the url to dump");
  Option urlOpts = OptionBuilder.create("url");

  Options options = new Options();
  options.addOption(helpOpts);
  options.addOption(webGraphOpts);
  options.addOption(urlOpts);

  CommandLineParser parser = new GnuParser();
  try {
    CommandLine line = parser.parse(options, args);

    boolean helpRequested = line.hasOption("help");
    boolean hasWebGraphDb = line.hasOption("webgraphdb");
    boolean hasUrl = line.hasOption("url");

    if (!helpRequested && hasWebGraphDb && hasUrl) {
      // Both required values are present: dump the url to system out.
      Path webGraphDb = new Path(line.getOptionValue("webgraphdb"));
      String url = line.getOptionValue("url");
      NodeReader reader = new NodeReader(NutchConfiguration.create());
      reader.dumpUrl(webGraphDb, url);
    } else {
      // Help was requested or a required option is missing.
      new HelpFormatter().printHelp("WebGraphReader", options);
    }
  }
  catch (Exception e) {
    // Failures are only reported on stderr; the tool still returns normally.
    e.printStackTrace();
  }
}
 
Example 15
Source File: SubcollectionIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * No-argument constructor: hands a configuration obtained from
 * {@code NutchConfiguration.create()} to the superclass constructor.
 */
public SubcollectionIndexingFilter() {
  super(
      NutchConfiguration.create());
}
 
Example 16
Source File: SubcollectionIndexingFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * No-argument constructor: hands a configuration obtained from
 * {@code NutchConfiguration.create()} to the superclass constructor.
 */
public SubcollectionIndexingFilter() {
  super(
      NutchConfiguration.create());
}
 
Example 17
Source File: LinkReader.java    From nutchpy with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the records at zero-based positions {@code [start, stop)} of the
 * sequence file at {@code path} and converts each one to a row map via
 * {@code getLinksRow}.
 *
 * Previously the record at position {@code start} was consumed by the skip
 * loop and thrown away, so the result was shifted by one and the first
 * record of the file could never be returned. Reading now stops as soon as
 * {@code stop} records have been consumed, and an empty list is returned
 * when {@code stop <= start}.
 *
 * Records whose conversion throws are skipped silently (best-effort read)
 * but still count towards the position.
 *
 * @param start position of the first record to return (inclusive)
 * @param stop  position at which to stop (exclusive)
 * @param path  location of the sequence file; values are read as
 *              {@code LinkDatum}
 * @return the converted rows, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List slice(long start, long stop, String path) throws IOException  {
    List<HashMap> rows=new ArrayList<HashMap>();

    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);

    Path file = new Path(path);

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
        Writable key = (Writable)
                ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        LinkDatum value = new LinkDatum();

        // i is the position of the record about to be read.
        long i = 0;
        while (i < stop && reader.next(key, value)) {
            if (i >= start) {
                try {
                    HashMap<String, String> t_row = getLinksRow(key,value);
                    rows.add(t_row);
                }
                catch (Exception ignored) {
                    // Deliberately best-effort: a row that cannot be
                    // converted is dropped and reading continues.
                }
            }
            i += 1;
        }
    } finally {
        // The reader was previously left open.
        reader.close();
    }

    return rows;
}
 
Example 18
Source File: CollectionManager.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Used for testing. Hands a configuration obtained from
 * {@code NutchConfiguration.create()} to the superclass constructor.
 */
protected CollectionManager() {
  super(
      NutchConfiguration.create());
}
 
Example 19
Source File: SegmentReader.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Command-line entry point of the segment reader.
 *
 * {@code args[0]} selects the mode ({@code -dump}, {@code -list} or
 * {@code -get}). Any of the {@code -nocontent}, {@code -nofetch},
 * {@code -nogenerate}, {@code -noparse}, {@code -noparsedata} and
 * {@code -noparsetext} flags found after it switches the matching boolean
 * passed to the {@code SegmentReader} constructor to {@code false}. The flag
 * is then blanked out of {@code args}, so the positional arguments of each
 * mode are still read from fixed indexes.
 *
 * @param args command line; fewer than two arguments prints the usage
 * @throws Exception if the selected operation fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return;
  }
  int mode = -1;
  if (args[0].equals("-dump")) {
    mode = MODE_DUMP;
  } else if (args[0].equals("-list")) {
    mode = MODE_LIST;
  } else if (args[0].equals("-get")) {
    mode = MODE_GET;
  }

  boolean co = true;
  boolean fe = true;
  boolean ge = true;
  boolean pa = true;
  boolean pd = true;
  boolean pt = true;
  // collect general options; consumed flags are replaced by null
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-nocontent")) {
      co = false;
      args[i] = null;
    } else if (args[i].equals("-nofetch")) {
      fe = false;
      args[i] = null;
    } else if (args[i].equals("-nogenerate")) {
      ge = false;
      args[i] = null;
    } else if (args[i].equals("-noparse")) {
      pa = false;
      args[i] = null;
    } else if (args[i].equals("-noparsedata")) {
      pd = false;
      args[i] = null;
    } else if (args[i].equals("-noparsetext")) {
      pt = false;
      args[i] = null;
    }
  }
  Configuration conf = NutchConfiguration.create();
  final FileSystem fs = FileSystem.get(conf);
  SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);
  // collect required args
  switch (mode) {
    case MODE_DUMP:
      String input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String output = args.length > 2 ? args[2] : null;
      if (output == null) {
        System.err.println("Missing required argument: <output>");
        usage();
        return;
      }
      segmentReader.dump(new Path(input), new Path(output));
      return;
    case MODE_LIST:
      ArrayList<Path> dirs = new ArrayList<Path>();
      for (int i = 1; i < args.length; i++) {
        if (args[i] == null) continue;
        if (args[i].equals("-dir")) {
          // -dir needs a path right after it. A trailing -dir, or one
          // followed by a flag consumed above, used to crash here.
          if (i + 1 >= args.length || args[i + 1] == null) {
            System.err.println("Missing required argument: directory after -dir");
            usage();
            return;
          }
          Path dir = new Path(args[++i]);
          FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
          Path[] files = HadoopFSUtil.getPaths(fstats);
          if (files != null && files.length > 0) {
            dirs.addAll(Arrays.asList(files));
          }
        } else dirs.add(new Path(args[i]));
      }
      segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
      return;
    case MODE_GET:
      input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String key = args.length > 2 ? args[2] : null;
      if (key == null) {
        System.err.println("Missing required argument: <keyValue>");
        usage();
        return;
      }
      segmentReader.get(new Path(input), new Text(key), new OutputStreamWriter(System.out, "UTF-8"), new HashMap<String, List<Writable>>());
      return;
    default:
      System.err.println("Invalid operation: " + args[0]);
      usage();
      return;
  }
}
 
Example 20
Source File: SegmentReader.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Command-line entry point of the segment reader.
 *
 * {@code args[0]} selects the mode ({@code -dump}, {@code -list} or
 * {@code -get}). Any of the {@code -nocontent}, {@code -nofetch},
 * {@code -nogenerate}, {@code -noparse}, {@code -noparsedata} and
 * {@code -noparsetext} flags found after it switches the matching boolean
 * passed to the {@code SegmentReader} constructor to {@code false}. The flag
 * is then blanked out of {@code args}, so the positional arguments of each
 * mode are still read from fixed indexes.
 *
 * @param args command line; fewer than two arguments prints the usage
 * @throws Exception if the selected operation fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return;
  }
  int mode = -1;
  if (args[0].equals("-dump")) {
    mode = MODE_DUMP;
  } else if (args[0].equals("-list")) {
    mode = MODE_LIST;
  } else if (args[0].equals("-get")) {
    mode = MODE_GET;
  }

  boolean co = true;
  boolean fe = true;
  boolean ge = true;
  boolean pa = true;
  boolean pd = true;
  boolean pt = true;
  // collect general options; consumed flags are replaced by null
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-nocontent")) {
      co = false;
      args[i] = null;
    } else if (args[i].equals("-nofetch")) {
      fe = false;
      args[i] = null;
    } else if (args[i].equals("-nogenerate")) {
      ge = false;
      args[i] = null;
    } else if (args[i].equals("-noparse")) {
      pa = false;
      args[i] = null;
    } else if (args[i].equals("-noparsedata")) {
      pd = false;
      args[i] = null;
    } else if (args[i].equals("-noparsetext")) {
      pt = false;
      args[i] = null;
    }
  }
  Configuration conf = NutchConfiguration.create();
  final FileSystem fs = FileSystem.get(conf);
  SegmentReader segmentReader = new SegmentReader(conf, co, fe, ge, pa, pd, pt);
  // collect required args
  switch (mode) {
    case MODE_DUMP:
      String input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String output = args.length > 2 ? args[2] : null;
      if (output == null) {
        System.err.println("Missing required argument: <output>");
        usage();
        return;
      }
      segmentReader.dump(new Path(input), new Path(output));
      return;
    case MODE_LIST:
      ArrayList<Path> dirs = new ArrayList<Path>();
      for (int i = 1; i < args.length; i++) {
        if (args[i] == null) continue;
        if (args[i].equals("-dir")) {
          // -dir needs a path right after it. A trailing -dir, or one
          // followed by a flag consumed above, used to crash here.
          if (i + 1 >= args.length || args[i + 1] == null) {
            System.err.println("Missing required argument: directory after -dir");
            usage();
            return;
          }
          Path dir = new Path(args[++i]);
          FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
          Path[] files = HadoopFSUtil.getPaths(fstats);
          if (files != null && files.length > 0) {
            dirs.addAll(Arrays.asList(files));
          }
        } else dirs.add(new Path(args[i]));
      }
      segmentReader.list(dirs, new OutputStreamWriter(System.out, "UTF-8"));
      return;
    case MODE_GET:
      input = args[1];
      if (input == null) {
        System.err.println("Missing required argument: <segment_dir>");
        usage();
        return;
      }
      String key = args.length > 2 ? args[2] : null;
      if (key == null) {
        System.err.println("Missing required argument: <keyValue>");
        usage();
        return;
      }
      segmentReader.get(new Path(input), new Text(key), new OutputStreamWriter(System.out, "UTF-8"), new HashMap<String, List<Writable>>());
      return;
    default:
      System.err.println("Invalid operation: " + args[0]);
      usage();
      return;
  }
}