org.apache.hadoop.mapred.SequenceFileOutputFormat Java Exaples

Source File: DataFsck.java From RDFS with Apache License 2.0

6 votes

private JobConf createJobConf() {
  JobConf jobConf = new JobConf(getConf());
  String jobName = NAME + " " + dateForm.format(new Date(System.currentTimeMillis()));
  jobConf.setJobName(jobName);
  jobConf.setMapSpeculativeExecution(false);

  jobConf.setJarByClass(DataFsck.class);
  jobConf.setInputFormat(DataFsckInputFormat.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(Text.class);

  jobConf.setMapperClass(DataFsckMapper.class);
  jobConf.setNumReduceTasks(0);
  return jobConf;
}

Source File: MultipleSequenceFileOutputFormat.java From hadoop-gpu with Apache License 2.0

5 votes

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs,
                                                 JobConf job,
                                                 String name,
                                                 Progressable arg3) 
throws IOException {
  if (theSequenceFileOutputFormat == null) {
    theSequenceFileOutputFormat = new SequenceFileOutputFormat<K,V>();
  }
  return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
}

Source File: MultipleSequenceFileOutputFormat.java From hadoop with Apache License 2.0

5 votes

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs,
                                                 JobConf job,
                                                 String name,
                                                 Progressable arg3) 
throws IOException {
  if (theSequenceFileOutputFormat == null) {
    theSequenceFileOutputFormat = new SequenceFileOutputFormat<K,V>();
  }
  return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
}

Source File: DataFsck.java From RDFS with Apache License 2.0

5 votes

List<SequenceFile.Reader> getOutputs(List<JobContext> submitted) throws IOException {
  List<SequenceFile.Reader> outputs = new ArrayList<SequenceFile.Reader>();
  for (JobContext ctx: submitted) {
    SequenceFile.Reader[] jobOutputs = SequenceFileOutputFormat.getReaders(
      getConf(),
      SequenceFileOutputFormat.getOutputPath(ctx.jobConf));
    for (SequenceFile.Reader r: jobOutputs) {
      outputs.add(r);
    }
  }
  return outputs;
}

Source File: MultipleSequenceFileOutputFormat.java From big-c with Apache License 2.0

5 votes

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs,
                                                 JobConf job,
                                                 String name,
                                                 Progressable arg3) 
throws IOException {
  if (theSequenceFileOutputFormat == null) {
    theSequenceFileOutputFormat = new SequenceFileOutputFormat<K,V>();
  }
  return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
}

Source File: MultipleSequenceFileOutputFormat.java From RDFS with Apache License 2.0

5 votes

@Override
protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs,
                                                 JobConf job,
                                                 String name,
                                                 Progressable arg3) 
throws IOException {
  if (theSequenceFileOutputFormat == null) {
    theSequenceFileOutputFormat = new SequenceFileOutputFormat<K,V>();
  }
  return theSequenceFileOutputFormat.getRecordWriter(fs, job, name, arg3);
}

Source File: CloudBurst.java From emr-sample-apps with Apache License 2.0

5 votes

public static void filter(String alignpath, 
	                  String outpath,
                            int nummappers,
                            int numreducers) throws IOException, Exception
  {
System.out.println("NUM_FMAP_TASKS: "     + nummappers);
System.out.println("NUM_FREDUCE_TASKS: "  + numreducers);

JobConf conf = new JobConf(FilterAlignments.class);
conf.setJobName("FilterAlignments");
conf.setNumMapTasks(nummappers);
conf.setNumReduceTasks(numreducers);

FileInputFormat.addInputPath(conf, new Path(alignpath));

conf.setMapperClass(FilterMapClass.class);

conf.setInputFormat(SequenceFileInputFormat.class);			
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(BytesWritable.class);

conf.setCombinerClass(FilterCombinerClass.class);

conf.setReducerClass(FilterReduceClass.class);		
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(BytesWritable.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);

Path oPath = new Path(outpath);
FileOutputFormat.setOutputPath(conf, oPath);
System.err.println("  Removing old results");
FileSystem.get(conf).delete(oPath);

JobClient.runJob(conf);

System.err.println("FilterAlignments Finished");		
  }

Source File: SequenceFile.java From sparkResearch with Apache License 2.0

5 votes

protected static void run(JavaSparkContext sparkContext) {
    JavaPairRDD<Text, IntWritable> javaPairRDD = sparkContext.sequenceFile("url", Text.class, IntWritable.class);
    JavaPairRDD<String, Integer> pairRDD = javaPairRDD.mapToPair(new sequenceToConvert());
    //写
    pairRDD.saveAsHadoopFile("url",Text.class,IntWritable.class,SequenceFileOutputFormat.class);
}

Source File: PerMapOutputFormat.java From webarchive-commons with Apache License 2.0

5 votes

private OutputFormat<K,V> getOutputFormat( JobConf job )
{
  return ReflectionUtils.newInstance( job.getClass( "permap.output.format.class",
                                                     SequenceFileOutputFormat.class,
                                                     OutputFormat.class ),
                                      job );
}

Source File: LinkRank.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional componenet of link analysis due to its extreme computational and
 * space requirements but it can be very useful is weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
  throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}

Source File: LinkRank.java From anthelion with Apache License 2.0

5 votes

/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional componenet of link analysis due to its extreme computational and
 * space requirements but it can be very useful is weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output)
  throws IOException {

  // configure the inverter
  JobConf inverter = new NutchJob(getConf());
  inverter.setJobName("LinkAnalysis Inverter");
  FileInputFormat.addInputPath(inverter, nodeDb);
  FileInputFormat.addInputPath(inverter, outlinkDb);

  // add the loop database if it exists, isn't null
  if (loopDb != null) {
    FileInputFormat.addInputPath(inverter, loopDb);
  }
  FileOutputFormat.setOutputPath(inverter, output);
  inverter.setInputFormat(SequenceFileInputFormat.class);
  inverter.setMapperClass(Inverter.class);
  inverter.setReducerClass(Inverter.class);
  inverter.setMapOutputKeyClass(Text.class);
  inverter.setMapOutputValueClass(ObjectWritable.class);
  inverter.setOutputKeyClass(Text.class);
  inverter.setOutputValueClass(LinkDatum.class);
  inverter.setOutputFormat(SequenceFileOutputFormat.class);
  inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

  // run the inverter job
  LOG.info("Starting inverter job");
  try {
    JobClient.runJob(inverter);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  LOG.info("Finished inverter job.");
}

Source File: HiveWriterFactory.java From flink with Apache License 2.0

5 votes

/**
 * Create a {@link RecordWriter} from path.
 */
public RecordWriter createRecordWriter(Path path) {
	try {
		checkInitialize();
		JobConf conf = new JobConf(confWrapper.conf());

		if (isCompressed) {
			String codecStr = conf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname);
			if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) {
				//noinspection unchecked
				Class<? extends CompressionCodec> codec =
						(Class<? extends CompressionCodec>) Class.forName(codecStr, true,
								Thread.currentThread().getContextClassLoader());
				FileOutputFormat.setOutputCompressorClass(conf, codec);
			}
			String typeStr = conf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname);
			if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) {
				SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr);
				SequenceFileOutputFormat.setOutputCompressionType(conf, style);
			}
		}

		return hiveShim.getHiveRecordWriter(
				conf,
				hiveOutputFormatClz,
				recordSerDe.getSerializedClass(),
				isCompressed,
				tableProperties,
				path);
	} catch (Exception e) {
		throw new FlinkHiveException(e);
	}
}

Source File: FetcherOutputFormat.java From nutch-htmlunit with Apache License 2.0

4 votes

public RecordWriter<Text, NutchWritable> getRecordWriter(final FileSystem fs,
                                    final JobConf job,
                                    final String name,
                                    final Progressable progress) throws IOException {

  Path out = FileOutputFormat.getOutputPath(job);
  final Path fetch =
    new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME), name);
  final Path content =
    new Path(new Path(out, Content.DIR_NAME), name);
  
  final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);

  final MapFile.Writer fetchOut =
    new MapFile.Writer(job, fs, fetch.toString(), Text.class, CrawlDatum.class,
        compType, progress);
  
  return new RecordWriter<Text, NutchWritable>() {
      private MapFile.Writer contentOut;
      private RecordWriter<Text, Parse> parseOut;

      {
        if (Fetcher.isStoringContent(job)) {
          contentOut = new MapFile.Writer(job, fs, content.toString(),
                                          Text.class, Content.class,
                                          compType, progress);
        }

        if (Fetcher.isParsing(job)) {
          parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, progress);
        }
      }

      public void write(Text key, NutchWritable value)
        throws IOException {

        Writable w = value.get();
        
        if (w instanceof CrawlDatum)
          fetchOut.append(key, w);
        else if (w instanceof Content && contentOut != null)
          contentOut.append(key, w);
        else if (w instanceof Parse && parseOut != null)
          parseOut.write(key, (Parse)w);
      }

      public void close(Reporter reporter) throws IOException {
        fetchOut.close();
        if (contentOut != null) {
          contentOut.close();
        }
        if (parseOut != null) {
          parseOut.close(reporter);
        }
      }

    };

}

Source File: RandomWriter.java From RDFS with Apache License 2.0

4 votes

/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }
  
  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  
  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}

Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0

4 votes

public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
      LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir =
      new Path(config.get("mapred.temp.dir", ".") +
               "/readdb-topN-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
      LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); }

  }

Source File: FreeGenerator.java From nutch-htmlunit with Apache License 2.0

4 votes

public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}

Source File: NodeDumper.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
  throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

Source File: RandomWriter.java From hadoop-book with Apache License 2.0

4 votes

/**
 * This is the main routine for launching a distributed random write job. It
 * runs 10 maps/node and each node writes 1 gig of data to a DFS file. The
 * reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
    if (args.length == 0) {
        System.out.println("Usage: writer <out-dir>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path outDir = new Path(args[0]);
    JobConf job = new JobConf(getConf());

    job.setJarByClass(RandomWriter.class);
    job.setJobName("random-writer");
    FileOutputFormat.setOutputPath(job, outDir);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(BytesWritable.class);

    job.setInputFormat(RandomInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(IdentityReducer.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient client = new JobClient(job);
    ClusterStatus cluster = client.getClusterStatus();
    int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
    long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map",
            1 * 1024 * 1024 * 1024);
    if (numBytesToWritePerMap == 0) {
        System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
        return -2;
    }
    long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes",
            numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
    int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
    if (numMaps == 0 && totalBytesToWrite > 0) {
        numMaps = 1;
        job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
    }

    job.setNumMapTasks(numMaps);
    System.out.println("Running " + numMaps + " maps.");

    // reducer NONE
    job.setNumReduceTasks(0);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    JobClient.runJob(job);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + (endTime.getTime() - startTime.getTime()) / 1000
            + " seconds.");

    return 0;
}

Source File: CloudBurst.java From emr-sample-apps with Apache License 2.0

4 votes

public static RunningJob alignall(String refpath, 
		                          String qrypath,
		                          String outpath,
		                          int MIN_READ_LEN,
		                          int MAX_READ_LEN,
		                          int K,
		                          int ALLOW_DIFFERENCES,
		                          boolean FILTER_ALIGNMENTS,
		                          int NUM_MAP_TASKS,
		                          int NUM_REDUCE_TASKS,
		                          int BLOCK_SIZE,
		                          int REDUNDANCY) throws IOException, Exception
{
	int SEED_LEN   = MIN_READ_LEN / (K+1);
	int FLANK_LEN  = MAX_READ_LEN-SEED_LEN+K;
	
	System.out.println("refath: "            + refpath);
	System.out.println("qrypath: "           + qrypath);
	System.out.println("outpath: "           + outpath);
	System.out.println("MIN_READ_LEN: "      + MIN_READ_LEN);
	System.out.println("MAX_READ_LEN: "      + MAX_READ_LEN);
	System.out.println("K: "                 + K);
	System.out.println("SEED_LEN: "          + SEED_LEN);
	System.out.println("FLANK_LEN: "         + FLANK_LEN);
	System.out.println("ALLOW_DIFFERENCES: " + ALLOW_DIFFERENCES);
	System.out.println("FILTER_ALIGNMENTS: " + FILTER_ALIGNMENTS);
	System.out.println("NUM_MAP_TASKS: "     + NUM_MAP_TASKS);
	System.out.println("NUM_REDUCE_TASKS: "  + NUM_REDUCE_TASKS);
	System.out.println("BLOCK_SIZE: "        + BLOCK_SIZE);
	System.out.println("REDUNDANCY: "        + REDUNDANCY);
	
	JobConf conf = new JobConf(MerReduce.class);
	conf.setJobName("CloudBurst");
	conf.setNumMapTasks(NUM_MAP_TASKS);
	conf.setNumReduceTasks(NUM_REDUCE_TASKS);
	
	FileInputFormat.addInputPath(conf, new Path(refpath));
	FileInputFormat.addInputPath(conf, new Path(qrypath));

	conf.set("refpath",           refpath);
	conf.set("qrypath",           qrypath);
	conf.set("MIN_READ_LEN",      Integer.toString(MIN_READ_LEN));
	conf.set("MAX_READ_LEN",      Integer.toString(MAX_READ_LEN));
	conf.set("K",                 Integer.toString(K));
	conf.set("SEED_LEN",          Integer.toString(SEED_LEN));
	conf.set("FLANK_LEN",         Integer.toString(FLANK_LEN));
	conf.set("ALLOW_DIFFERENCES", Integer.toString(ALLOW_DIFFERENCES));
	conf.set("BLOCK_SIZE",        Integer.toString(BLOCK_SIZE));
	conf.set("REDUNDANCY",        Integer.toString(REDUNDANCY));
	conf.set("FILTER_ALIGNMENTS", (FILTER_ALIGNMENTS ? "1" : "0"));
	
	conf.setMapperClass(MapClass.class);
	
	conf.setInputFormat(SequenceFileInputFormat.class);			
	conf.setMapOutputKeyClass(BytesWritable.class);
	conf.setMapOutputValueClass(BytesWritable.class);
	
	conf.setReducerClass(ReduceClass.class);		
	conf.setOutputKeyClass(IntWritable.class);
	conf.setOutputValueClass(BytesWritable.class);
	conf.setOutputFormat(SequenceFileOutputFormat.class);

	Path oPath = new Path(outpath);
	FileOutputFormat.setOutputPath(conf, oPath);
	System.err.println("  Removing old results");
	FileSystem.get(conf).delete(oPath);
	
	RunningJob rj = JobClient.runJob(conf);
	System.err.println("CloudBurst Finished");
	return rj;
}

Source File: RandomTextWriter.java From hadoop-gpu with Apache License 2.0

4 votes

/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomtextwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}

Source File: PiEstimator.java From hadoop-gpu with Apache License 2.0

4 votes

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf
    ) throws IOException {
  //setup job conf
  jobConf.setJobName(PiEstimator.class.getSimpleName());

  jobConf.setInputFormat(SequenceFileInputFormat.class);

  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);

  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  //setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException("Tmp directory " + fs.makeQualified(TMP_DIR)
        + " already exists.  Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    //generate an input file for each map task
    for(int i=0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part"+i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(
          fs, jobConf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #"+i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime)/1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    //read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    //compute estimated value
    return BigDecimal.valueOf(4).setScale(20)
        .multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps))
        .divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}

Source File: RandomWriter.java From hadoop-gpu with Apache License 2.0

4 votes

/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }
  
  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  
  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}

Source File: DataJoinJob.java From hadoop-gpu with Apache License 2.0

4 votes

public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
      System.out.println("Using TextInputFormat: " + args[2]);
      inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileOutputFormat: " + args[7]);
      outputFormat = SequenceFileOutputFormat.class;
      outputValueClass = getClassByName(args[7]);
    } else {
      System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
      maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
      jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir));
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job,
            SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
  }

Source File: DataJoinJob.java From RDFS with Apache License 2.0

4 votes

public static JobConf createDataJoinJob(String args[]) throws IOException {

    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
      System.out.println("Using TextInputFormat: " + args[2]);
      inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
      System.out.println("Using SequenceFileOutputFormat: " + args[7]);
      outputFormat = SequenceFileOutputFormat.class;
      outputValueClass = getClassByName(args[7]);
    } else {
      System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
      maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
      jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir));
    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job,
            SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
  }

Source File: PiEstimator.java From RDFS with Apache License 2.0

4 votes

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf
    ) throws IOException {
  //setup job conf
  jobConf.setJobName(PiEstimator.class.getSimpleName());

  jobConf.setInputFormat(SequenceFileInputFormat.class);

  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);

  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  //setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException("Tmp directory " + fs.makeQualified(TMP_DIR)
        + " already exists.  Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    //generate an input file for each map task
    for(int i=0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part"+i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(
          fs, jobConf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #"+i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime)/1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    //read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    //compute estimated value
    return BigDecimal.valueOf(4).setScale(20)
        .multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps))
        .divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}

Source File: RandomTextWriter.java From RDFS with Apache License 2.0

4 votes

/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 * 
 * @throws IOException 
 */
public int run(String[] args) throws Exception {    
  if (args.length == 0) {
    return printUsage();    
  }
  
  JobConf job = new JobConf(getConf());
  
  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");
  
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  
  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);        
  
  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap = job.getLong("test.randomtextwrite.bytes_per_map",
                                           1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes", 
       numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }
  
  Class<? extends OutputFormat> outputFormatClass = 
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for(int i=0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = 
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " +
          args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));
  
  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");
  
  // reducer NONE
  job.setNumReduceTasks(0);
  
  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + 
                     (endTime.getTime() - startTime.getTime()) /1000 + 
                     " seconds.");
  
  return 0;
}

Source File: RDDConverterUtils.java From systemds with Apache License 2.0

4 votes

/**
 * Converts a libsvm text input file into two binary block matrices for features 
 * and labels, and saves these to the specified output files. This call also deletes 
 * existing files at the specified output locations, as well as determines and 
 * writes the meta data files of both output matrices. 
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing 
 * the libsvm input files in order to ensure consistency with Spark.
 * 
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn, 
		String pathX, String pathY, DataCharacteristics mcOutX)
{
	if( !mcOutX.dimsKnown() )
		throw new DMLRuntimeException("Matrix characteristics "
			+ "required to convert sparse input representation.");
	try {
		//cleanup existing output files
		HDFSTool.deleteFileIfExistOnHDFS(pathX);
		HDFSTool.deleteFileIfExistOnHDFS(pathY);
		
		//convert libsvm to labeled points
		int numFeatures = (int) mcOutX.getCols();
		int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
		JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints = 
				MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();
		
		//append row index and best-effort caching to avoid repeated text parsing
		JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints = 
				lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK()); 
		
		//extract labels and convert to binary block
		DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
		int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
		out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
		out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);
		
		//extract data and convert to binary block
		DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
		LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
		JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
				.mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
		out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
		out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
		mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
		HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);
		
		//asynchronous cleanup of cached intermediates
		ilpoints.unpersist(false);
	}
	catch(IOException ex) {
		throw new DMLRuntimeException(ex);
	}
}

Source File: NodeDumper.java From anthelion with Apache License 2.0

4 votes

/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile)
  throws Exception {

  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("NodeDumper: starting at " + sdf.format(start));
  Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
  Configuration conf = getConf();

  JobConf dumper = new NutchJob(conf);
  dumper.setJobName("NodeDumper: " + webGraphDb);
  FileInputFormat.addInputPath(dumper, nodeDb);
  dumper.setInputFormat(SequenceFileInputFormat.class);

  if (nameType == null) {
    dumper.setMapperClass(Sorter.class);
    dumper.setReducerClass(Sorter.class);
    dumper.setMapOutputKeyClass(FloatWritable.class);
    dumper.setMapOutputValueClass(Text.class);
  } else {
    dumper.setMapperClass(Dumper.class);
    dumper.setReducerClass(Dumper.class);
    dumper.setMapOutputKeyClass(Text.class);
    dumper.setMapOutputValueClass(FloatWritable.class);
  }

  dumper.setOutputKeyClass(Text.class);
  dumper.setOutputValueClass(FloatWritable.class);
  FileOutputFormat.setOutputPath(dumper, output);

  if (asSequenceFile) {
    dumper.setOutputFormat(SequenceFileOutputFormat.class);
  } else {
    dumper.setOutputFormat(TextOutputFormat.class);
  }

  dumper.setNumReduceTasks(1);
  dumper.setBoolean("inlinks", type == DumpType.INLINKS);
  dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
  dumper.setBoolean("scores", type == DumpType.SCORES);

  dumper.setBoolean("host", nameType == NameType.HOST);
  dumper.setBoolean("domain", nameType == NameType.DOMAIN);
  dumper.setBoolean("sum", aggrType == AggrType.SUM);
  dumper.setBoolean("max", aggrType == AggrType.MAX);

  dumper.setLong("topn", topN);

  // Set equals-sign as separator for Solr's ExternalFileField
  if (asEff) {
    dumper.set("mapred.textoutputformat.separator", "=");
  }

  try {
    LOG.info("NodeDumper: running");
    JobClient.runJob(dumper);
  }
  catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
    throw e;
  }
  long end = System.currentTimeMillis();
  LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

Source File: FreeGenerator.java From anthelion with Apache License 2.0

4 votes

public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
    System.err.println("\tinputDir\tinput directory containing one or more input files.");
    System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
    System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
    System.err.println("\t-filter\trun current URLFilters on input URLs");
    System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
    return -1;
  }
  boolean filter = false;
  boolean normalize = false;
  if (args.length > 2) {
    for (int i = 2; i < args.length; i++) {
      if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else {
        LOG.error("Unknown argument: " + args[i] + ", exiting ...");
        return -1;
      }
    }
  }
  
  SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
  long start = System.currentTimeMillis();
  LOG.info("FreeGenerator: starting at " + sdf.format(start));

  JobConf job = new NutchJob(getConf());
  job.setBoolean(FILTER_KEY, filter);
  job.setBoolean(NORMALIZE_KEY, normalize);
  FileInputFormat.addInputPath(job, new Path(args[0]));
  job.setInputFormat(TextInputFormat.class);
  job.setMapperClass(FG.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Generator.SelectorEntry.class);
  job.setPartitionerClass(URLPartitioner.class);
  job.setReducerClass(FG.class);
  String segName = Generator.generateSegmentName();
  job.setNumReduceTasks(job.getNumMapTasks());
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(CrawlDatum.class);
  job.setOutputKeyComparatorClass(Generator.HashComparator.class);
  FileOutputFormat.setOutputPath(job, new Path(args[1],
      new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
  try {
    JobClient.runJob(job);
  } catch (Exception e) {
    LOG.error("FAILED: " + StringUtils.stringifyException(e));
    return -1;
  }
  long end = System.currentTimeMillis();
  LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  return 0;
}

Source File: CrawlDbReader.java From anthelion with Apache License 2.0

4 votes

public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException {
  
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
    LOG.info("CrawlDb db: " + crawlDb);
  }
  
  Path outFolder = new Path(output);
  Path tempDir =
    new Path(config.get("mapred.temp.dir", ".") +
             "/readdb-topN-temp-"+
             Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("topN prepare " + crawlDb);
  FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(CrawlDbTopNMapper.class);
  job.setReducerClass(IdentityReducer.class);

  FileOutputFormat.setOutputPath(job, tempDir);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setOutputKeyClass(FloatWritable.class);
  job.setOutputValueClass(Text.class);

  // XXX hmmm, no setFloat() in the API ... :(
  job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
  JobClient.runJob(job); 
  
  if (LOG.isInfoEnabled()) {
    LOG.info("CrawlDb topN: collecting topN scores.");
  }
  job = new NutchJob(config);
  job.setJobName("topN collect " + crawlDb);
  job.setLong("db.reader.topn", topN);

  FileInputFormat.addInputPath(job, tempDir);
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setMapperClass(IdentityMapper.class);
  job.setReducerClass(CrawlDbTopNReducer.class);

  FileOutputFormat.setOutputPath(job, outFolder);
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(FloatWritable.class);
  job.setOutputValueClass(Text.class);

  job.setNumReduceTasks(1); // create a single file.

  JobClient.runJob(job);
  FileSystem fs = FileSystem.get(config);
  fs.delete(tempDir, true);
  if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); }

}

org.apache.hadoop.mapred.SequenceFileOutputFormat Java Examples