org.apache.hadoop.io.MapFile Java Examples

The following examples show how to use org.apache.hadoop.io.MapFile. They are drawn from a range of open-source projects; the source file and originating project are noted above each example.
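As a quick orientation before the project-specific examples, here is a minimal, self-contained sketch of writing a MapFile and then looking up an entry by key. It is not taken from any of the projects below; the output path and key/value types are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class MapFileDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path("/tmp/mapfile-demo");   // illustrative path

    // Write: MapFile keys must be appended in sorted order, hence the zero-padded keys.
    MapFile.Writer writer = new MapFile.Writer(conf, dir,
        MapFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(IntWritable.class));
    for (int i = 0; i < 100; i++) {
      writer.append(new Text(String.format("%05d", i)), new IntWritable(i));
    }
    writer.close();

    // Read: random access by key via the in-memory index.
    MapFile.Reader reader = new MapFile.Reader(dir, conf);
    IntWritable value = new IntWritable();
    reader.get(new Text("00042"), value);   // value holds 42 if the key exists
    reader.close();
  }
}

A MapFile is really a directory containing a sorted SequenceFile named "data" plus a smaller "index" file, which is why several of the examples below treat MapFile output as a directory.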
Example #1
Source File: TestLinkDbMerger.java    From anthelion with Apache License 2.0
private void createLinkDb(Configuration config, FileSystem fs, Path linkdb, TreeMap init) throws Exception {
  LOG.fine("* creating linkdb: " + linkdb);
  Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
  MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, "part-00000").toString(), Text.class, Inlinks.class);
  Iterator it = init.keySet().iterator();
  while (it.hasNext()) {
    String key = (String)it.next();
    Inlinks inlinks = new Inlinks();
    String[] vals = (String[])init.get(key);
    for (int i = 0; i < vals.length; i++) {
      Inlink in = new Inlink(vals[i], vals[i]);
      inlinks.add(in);
    }
    writer.append(new Text(key), inlinks);
  }
  writer.close();
}
 
Example #2
Source File: AbstractMapFileWriter.java    From DataVec with Apache License 2.0
/**
 *
 * @param outputDir           Output directory for the map file(s)
 * @param mapFileSplitSize    Split size for the map file: if 0, use a single map file for all output. If > 0,
 *                            multiple map files will be used, each containing at most mapFileSplitSize entries.
 *                            This can be used to avoid a single multi-gigabyte map file, which may be
 *                            undesirable in some cases (transfer across the network, for example)
 * @param convertTextTo       If null, make no changes to Text writable objects. If non-null, Text writable instances
 *                            will be converted to this type. This is useful when you would rather store numerical
 *                            values even if the original record reader produces strings/text.
 * @param indexInterval       Index interval for the MapFile. Defaults to 1, which is suitable for most cases
 * @param filenamePattern     The naming pattern for the map files. Used with String.format(pattern, int)
 * @param hadoopConfiguration Hadoop configuration.
 */
public AbstractMapFileWriter(@NonNull File outputDir, int mapFileSplitSize, WritableType convertTextTo,
                             int indexInterval, String filenamePattern,
                             org.apache.hadoop.conf.Configuration hadoopConfiguration) {
    if(indexInterval <= 0){
        throw new UnsupportedOperationException("Index interval: must be > 0 (got: " + indexInterval + ")");
    }
    this.outputDir = outputDir;
    this.mapFileSplitSize = mapFileSplitSize;
    if (convertTextTo == WritableType.Text) {
        convertTextTo = null;
    }
    this.convertTextTo = convertTextTo;
    this.indexInterval = indexInterval;
    this.filenamePattern = filenamePattern;

    this.hadoopConfiguration = hadoopConfiguration;
    if(this.hadoopConfiguration.get(MAP_FILE_INDEX_INTERVAL_KEY) != null){
        this.hadoopConfiguration.set(MAP_FILE_INDEX_INTERVAL_KEY, String.valueOf(indexInterval));
    }

    opts = new SequenceFile.Writer.Option[]{MapFile.Writer.keyClass(KEY_CLASS),
            SequenceFile.Writer.valueClass(getValueClass())};

}
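The index interval handling above works because MapFile.Writer reads its interval from the Hadoop configuration when constructed. As a side note, the equivalent direct configuration would look roughly like the following sketch (it assumes MAP_FILE_INDEX_INTERVAL_KEY is the standard "io.map.index.interval" key; this is not DataVec code):

Configuration conf = new Configuration();
// Index every entry; larger values produce a smaller in-memory index at the cost of
// a short scan within the data file on each lookup.
conf.setInt("io.map.index.interval", 1);

MapFile.Writer also exposes a setIndexInterval(int) method for setting this on a per-writer basis.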
 
Example #3
Source File: MapFileReader.java    From DataVec with Apache License 2.0
public MapFileReader(List<String> paths, IndexToKey indexToKey, Class<? extends Writable> recordClass)
                throws IOException {

    this.indexToKey = indexToKey;
    this.recordClass = recordClass;
    this.readers = new MapFile.Reader[paths.size()];

    SequenceFile.Reader.Option[] opts = new SequenceFile.Reader.Option[0];

    Configuration config = new Configuration();
    for (int i = 0; i < paths.size(); i++) {
        readers[i] = new MapFile.Reader(new Path(paths.get(i)), config, opts);
        if (readers[i].getValueClass() != recordClass) {
            throw new UnsupportedOperationException("MapFile record class: " + readers[i].getValueClass()
                            + ", but got class " + recordClass + ", path = " + paths.get(i));
        }
    }

    recordIndexesEachReader = indexToKey.initialize(readers, recordClass);
}
 
Example #4
Source File: TestFileOutputCommitter.java    From big-c with Apache License 2.0
private void validateMapFileOutputContent(
    FileSystem fs, Path dir) throws IOException {
  // map output is a directory with index and data files
  Path expectedMapDir = new Path(dir, partFile);
  assert(fs.getFileStatus(expectedMapDir).isDirectory());    
  FileStatus[] files = fs.listStatus(expectedMapDir);
  int fileCount = 0;
  boolean dataFileFound = false; 
  boolean indexFileFound = false; 
  for (FileStatus f : files) {
    if (f.isFile()) {
      ++fileCount;
      if (f.getPath().getName().equals(MapFile.INDEX_FILE_NAME)) {
        indexFileFound = true;
      }
      else if (f.getPath().getName().equals(MapFile.DATA_FILE_NAME)) {
        dataFileFound = true;
      }
    }
  }
  assert(fileCount > 0);
  assert(dataFileFound && indexFileFound);
}
 
Example #5
Source File: SequenceFileInputFormat.java    From big-c with Apache License 2.0
@Override
protected List<FileStatus> listStatus(JobContext job
                                      )throws IOException {

  List<FileStatus> files = super.listStatus(job);
  int len = files.size();
  for(int i=0; i < len; ++i) {
    FileStatus file = files.get(i);
    if (file.isDirectory()) {     // it's a MapFile
      Path p = file.getPath();
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      // use the data file
      files.set(i, fs.getFileStatus(new Path(p, MapFile.DATA_FILE_NAME)));
    }
  }
  return files;
}
 
Example #6
Source File: MapFileOutputFormat.java    From hadoop-gpu with Apache License 2.0
/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(FileSystem ignored, Path dir,
                                          Configuration conf)
  throws IOException {
  FileSystem fs = dir.getFileSystem(conf);
  Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

  // sort names, so that hash partitioning works
  Arrays.sort(names);
  
  MapFile.Reader[] parts = new MapFile.Reader[names.length];
  for (int i = 0; i < names.length; i++) {
    parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
  }
  return parts;
}
 
Example #7
Source File: HdfsMrsPyramidOutputFormat.java    From mrgeo with Apache License 2.0
@Override
public RecordWriter<WritableComparable<?>, Writable> getRecordWriter(TaskAttemptContext context) throws IOException
{
  CompressionCodec codec = null;
  CompressionType compressionType = CompressionType.NONE;
  if (getCompressOutput(context))
  {
    // find the kind of compression to do
    compressionType = SequenceFileOutputFormat.getOutputCompressionType(context);

    // find the right codec
    codec = getCompressionCodec(context);
  }

  Path file = getDefaultWorkFile(context, "");

  MapFile.Writer out = createMapFileWriter(context, codec, compressionType, file);

  return new Writer(out);
}
 
Example #8
Source File: SeqCombiner.java    From compiler with Apache License 2.0
public static long readAndAppendCommit(Configuration conf, FileSystem fileSystem, MapFile.Writer writer, String fileName, long lastAstKey, long lastCommitKey) throws IOException {
	long newLastKey = lastCommitKey;
	SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, new Path(fileName), conf);
	LongWritable longKey = new LongWritable();
	BytesWritable value = new BytesWritable();
	try {
		while (r.next(longKey, value)) {
			newLastKey = longKey.get() + lastCommitKey;
			Revision rev = Revision.parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength()));
			Revision.Builder rb = Revision.newBuilder(rev);
			for (ChangedFile.Builder cfb : rb.getFilesBuilderList()) {
				long key = cfb.getKey();
				if (key > 0)
					cfb.setKey(lastAstKey + key);
			}
			writer.append(new LongWritable(newLastKey), new BytesWritable(rb.build().toByteArray()));
		}
	} catch (Exception e) {
		System.err.println(fileName);
		e.printStackTrace();
	} finally {
		r.close();
	}
	return newLastKey;
}
 
Example #9
Source File: FileSystemManagedLedgerOffloader.java    From pulsar with Apache License 2.0
@Override
public CompletableFuture<ReadHandle> readOffloaded(long ledgerId, UUID uuid, Map<String, String> offloadDriverMetadata) {

    CompletableFuture<ReadHandle> promise = new CompletableFuture<>();
    String storagePath = getStoragePath(storageBasePath, offloadDriverMetadata.get(MANAGED_LEDGER_NAME));
    String dataFilePath = getDataFilePath(storagePath, ledgerId, uuid);
    scheduler.chooseThread(ledgerId).submit(() -> {
        try {
            MapFile.Reader reader = new MapFile.Reader(new Path(dataFilePath),
                    configuration);
            promise.complete(FileStoreBackedReadHandleImpl.open(scheduler.chooseThread(ledgerId), reader, ledgerId));
        } catch (Throwable t) {
            log.error("Failed to open FileStoreBackedReadHandleImpl: ManagerLedgerName: {}, " +
                    "LegerId: {}, UUID: {}", offloadDriverMetadata.get(MANAGED_LEDGER_NAME), ledgerId, uuid, t);
            promise.completeExceptionally(t);
        }
    });
    return promise;
}
 
Example #10
Source File: FileStoreBackedReadHandleImpl.java    From pulsar with Apache License 2.0
private FileStoreBackedReadHandleImpl(ExecutorService executor, MapFile.Reader reader, long ledgerId) throws IOException {
    this.ledgerId = ledgerId;
    this.executor = executor;
    this.reader = reader;
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();
    try {
        key.set(FileSystemManagedLedgerOffloader.METADATA_KEY_INDEX);
        reader.get(key, value);
        this.ledgerMetadata = parseLedgerMetadata(value.copyBytes());
    } catch (IOException e) {
        log.error("Fail to read LedgerMetadata for ledgerId {}",
                ledgerId);
        throw new IOException("Fail to read LedgerMetadata for ledgerId " + key.get());
    }
}
 
Example #11
Source File: MapFileOutputFormat.java    From RDFS with Apache License 2.0
/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(FileSystem ignored, Path dir,
                                          Configuration conf)
  throws IOException {
  FileSystem fs = dir.getFileSystem(conf);
  Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

  // sort names, so that hash partitioning works
  Arrays.sort(names);
  
  MapFile.Reader[] parts = new MapFile.Reader[names.length];
  for (int i = 0; i < names.length; i++) {
    parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
  }
  return parts;
}
 
Example #12
Source File: TestCodec.java    From hadoop with Apache License 2.0
private void codecTestMapFile(Class<? extends CompressionCodec> clazz,
    CompressionType type, int records) throws Exception {
  
  FileSystem fs = FileSystem.get(conf);
  LOG.info("Creating MapFiles with " + records  + 
          " records using codec " + clazz.getSimpleName());
  Path path = new Path(new Path(
      System.getProperty("test.build.data", "/tmp")),
    clazz.getSimpleName() + "-" + type + "-" + records);

  LOG.info("Writing " + path);
  createMapFile(conf, fs, path, clazz.newInstance(), type, records);
  MapFile.Reader reader = new MapFile.Reader(path, conf);
  Text key1 = new Text("002");
  assertNotNull(reader.get(key1, new Text()));
  Text key2 = new Text("004");
  assertNotNull(reader.get(key2, new Text()));
}
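The createMapFile helper called above is not part of this snippet. A plausible minimal version (an assumption for illustration, not the actual Hadoop test utility) would append zero-padded Text keys in sorted order so that lookups such as "002" and "004" succeed:

private static void createMapFile(Configuration conf, FileSystem fs, Path path,
    CompressionCodec codec, CompressionType type, int records) throws IOException {
  // Hypothetical sketch; fs is unused here and kept only to match the call site above.
  // MapFile requires keys to be appended in sorted order, hence the zero padding.
  MapFile.Writer writer = new MapFile.Writer(conf, path,
      MapFile.Writer.keyClass(Text.class),
      SequenceFile.Writer.valueClass(Text.class),
      SequenceFile.Writer.compression(type, codec));
  for (int i = 0; i < records; i++) {
    String k = String.format("%03d", i);
    writer.append(new Text(k), new Text("value-" + k));
  }
  writer.close();
}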
 
Example #13
Source File: TestFileOutputCommitter.java    From hadoop with Apache License 2.0
private void validateMapFileOutputContent(
    FileSystem fs, Path dir) throws IOException {
  // map output is a directory with index and data files
  Path expectedMapDir = new Path(dir, partFile);
  assert(fs.getFileStatus(expectedMapDir).isDirectory());    
  FileStatus[] files = fs.listStatus(expectedMapDir);
  int fileCount = 0;
  boolean dataFileFound = false; 
  boolean indexFileFound = false; 
  for (FileStatus f : files) {
    if (f.isFile()) {
      ++fileCount;
      if (f.getPath().getName().equals(MapFile.INDEX_FILE_NAME)) {
        indexFileFound = true;
      }
      else if (f.getPath().getName().equals(MapFile.DATA_FILE_NAME)) {
        dataFileFound = true;
      }
    }
  }
  assert(fileCount > 0);
  assert(dataFileFound && indexFileFound);
}
 
Example #14
Source File: SequenceFileInputFormat.java    From hadoop with Apache License 2.0
@Override
protected List<FileStatus> listStatus(JobContext job
                                      )throws IOException {

  List<FileStatus> files = super.listStatus(job);
  int len = files.size();
  for(int i=0; i < len; ++i) {
    FileStatus file = files.get(i);
    if (file.isDirectory()) {     // it's a MapFile
      Path p = file.getPath();
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      // use the data file
      files.set(i, fs.getFileStatus(new Path(p, MapFile.DATA_FILE_NAME)));
    }
  }
  return files;
}
 
Example #15
Source File: SequenceFileInputFormat.java    From RDFS with Apache License 2.0
@Override
protected List<LocatedFileStatus> listLocatedStatus(JobContext job
                                      )throws IOException {

  List<LocatedFileStatus> files = super.listLocatedStatus(job);
  int len = files.size();
  for(int i=0; i < len; ++i) {
    FileStatus file = files.get(i);
    if (file.isDir()) {     // it's a MapFile
      Path p = file.getPath();
      FileSystem fs = p.getFileSystem(job.getConfiguration());
      // use the data file
      files.set(i, fs.listLocatedStatus(
          new Path(p, MapFile.DATA_FILE_NAME)).next());
    }
  }
  return files;
}
 
Example #16
Source File: AbstractMapFileWriter.java    From deeplearning4j with Apache License 2.0
/**
 *
 * @param outputDir           Output directory for the map file(s)
 * @param mapFileSplitSize    Split size for the map file: if 0, use a single map file for all output. If > 0,
 *                            multiple map files will be used, each containing at most mapFileSplitSize entries.
 *                            This can be used to avoid a single multi-gigabyte map file, which may be
 *                            undesirable in some cases (transfer across the network, for example)
 * @param convertTextTo       If null, make no changes to Text writable objects. If non-null, Text writable instances
 *                            will be converted to this type. This is useful when you would rather store numerical
 *                            values even if the original record reader produces strings/text.
 * @param indexInterval       Index interval for the MapFile. Defaults to 1, which is suitable for most cases
 * @param filenamePattern     The naming pattern for the map files. Used with String.format(pattern, int)
 * @param hadoopConfiguration Hadoop configuration.
 */
public AbstractMapFileWriter(@NonNull File outputDir, int mapFileSplitSize, WritableType convertTextTo,
                             int indexInterval, String filenamePattern,
                             org.apache.hadoop.conf.Configuration hadoopConfiguration) {
    if(indexInterval <= 0){
        throw new UnsupportedOperationException("Index interval: must be > 0 (got: " + indexInterval + ")");
    }
    this.outputDir = outputDir;
    this.mapFileSplitSize = mapFileSplitSize;
    if (convertTextTo == WritableType.Text) {
        convertTextTo = null;
    }
    this.convertTextTo = convertTextTo;
    this.indexInterval = indexInterval;
    this.filenamePattern = filenamePattern;

    this.hadoopConfiguration = hadoopConfiguration;
    if(this.hadoopConfiguration.get(MAP_FILE_INDEX_INTERVAL_KEY) != null){
        this.hadoopConfiguration.set(MAP_FILE_INDEX_INTERVAL_KEY, String.valueOf(indexInterval));
    }

    opts = new SequenceFile.Writer.Option[]{MapFile.Writer.keyClass(KEY_CLASS),
            SequenceFile.Writer.valueClass(getValueClass())};

}
 
Example #17
Source File: MapFileReader.java    From deeplearning4j with Apache License 2.0
public MapFileReader(List<String> paths, IndexToKey indexToKey, Class<? extends Writable> recordClass)
                throws IOException {

    this.indexToKey = indexToKey;
    this.recordClass = recordClass;
    this.readers = new MapFile.Reader[paths.size()];

    SequenceFile.Reader.Option[] opts = new SequenceFile.Reader.Option[0];

    Configuration config = new Configuration();
    for (int i = 0; i < paths.size(); i++) {
        readers[i] = new MapFile.Reader(new Path(paths.get(i)), config, opts);
        if (readers[i].getValueClass() != recordClass) {
            throw new UnsupportedOperationException("MapFile record class: " + readers[i].getValueClass()
                            + ", but got class " + recordClass + ", path = " + paths.get(i));
        }
    }

    recordIndexesEachReader = indexToKey.initialize(readers, recordClass);
}
 
Example #18
Source File: SafeFileOutputCommitterTest.java    From datawave with Apache License 2.0
private void validateMapFileOutputContent(FileSystem fs, Path dir) throws IOException {
    // map output is a directory with index and data files
    Path expectedMapDir = new Path(dir, partFile);
    assert (fs.getFileStatus(expectedMapDir).isDirectory());
    FileStatus[] files = fs.listStatus(expectedMapDir);
    int fileCount = 0;
    boolean dataFileFound = false;
    boolean indexFileFound = false;
    for (FileStatus f : files) {
        if (f.isFile()) {
            ++fileCount;
            if (f.getPath().getName().equals(MapFile.INDEX_FILE_NAME)) {
                indexFileFound = true;
            } else if (f.getPath().getName().equals(MapFile.DATA_FILE_NAME)) {
                dataFileFound = true;
            }
        }
    }
    assert (fileCount > 0);
    assert (dataFileFound && indexFileFound);
}
 
Example #19
Source File: SequenceFileInputFormat.java    From RDFS with Apache License 2.0
@Override
protected LocatedFileStatus[] listLocatedStatus(JobConf job) throws IOException {
  LocatedFileStatus[] files = super.listLocatedStatus(job);
  for (int i = 0; i < files.length; i++) {
    FileStatus file = files[i];
    if (file.isDir()) {     // it's a MapFile
      Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
      FileSystem fs = file.getPath().getFileSystem(job);
      // use the data file
      files[i] = fs.listLocatedStatus(dataFile).next();
    }
  }
  return files;
}
 
Example #20
Source File: MapFileOutputFormat.java    From RDFS with Apache License 2.0
public RecordWriter<WritableComparable, Writable> getRecordWriter(FileSystem ignored, JobConf job,
                                    String name, Progressable progress)
  throws IOException {
  // get the path of the temporary output file 
  Path file = FileOutputFormat.getTaskOutputPath(job, name);
  
  FileSystem fs = file.getFileSystem(job);
  CompressionCodec codec = null;
  CompressionType compressionType = CompressionType.NONE;
  if (getCompressOutput(job)) {
    // find the kind of compression to do
    compressionType = SequenceFileOutputFormat.getOutputCompressionType(job);

    // find the right codec
    Class<? extends CompressionCodec> codecClass =
        getOutputCompressorClass(job, DefaultCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, job);
  }
  
  // ignore the progress parameter, since MapFile is local
  final MapFile.Writer out =
    new MapFile.Writer(job, fs, file.toString(),
                       job.getOutputKeyClass().asSubclass(WritableComparable.class),
                       job.getOutputValueClass().asSubclass(Writable.class),
                       compressionType, codec,
                       progress);

  return new RecordWriter<WritableComparable, Writable>() {

      public void write(WritableComparable key, Writable value)
        throws IOException {

        out.append(key, value);
      }

      public void close(Reporter reporter) throws IOException { out.close();}
    };
}
 
Example #21
Source File: MapFileOutputFormat.java    From RDFS with Apache License 2.0
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable, V extends Writable>
Writable getEntry(MapFile.Reader[] readers,
                                Partitioner<K, V> partitioner,
                                K key,
                                V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}
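Combined with getReaders from the MapFileOutputFormat examples above, getEntry gives a by-key lookup across partitioned job output: the partitioner picks the same part number at read time that was used at write time. A usage sketch follows (the output path, Text key/value types, and HashPartitioner are assumptions, not shown in the source):

// Old-API (org.apache.hadoop.mapred) types, matching this MapFileOutputFormat.
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path("/job/output"), conf);
Partitioner<Text, Text> partitioner = new HashPartitioner<Text, Text>();
Text value = new Text();
Writable found = MapFileOutputFormat.getEntry(readers, partitioner, new Text("someKey"), value);
if (found == null) {
  // the key is not present in any partition
}
for (MapFile.Reader r : readers) {
  r.close();
}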
 
Example #22
Source File: AbstractMapFileWriter.java    From deeplearning4j with Apache License 2.0
public void close() {
    try {
        for (MapFile.Writer w : writers) {
            w.close();
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        isClosed.set(true);
    }
}
 
Example #23
Source File: FSUtils.java    From anthelion with Apache License 2.0
/**
 * Closes a group of MapFile readers.
 * 
 * @param readers The MapFile readers to close.
 * @throws IOException If an error occurs while closing a reader.
 */
public static void closeReaders(MapFile.Reader[] readers)
  throws IOException {
  
  // loop through the readers closing one by one
  if (readers != null) {
    for (int i = 0; i < readers.length; i++) {
      MapFile.Reader reader = readers[i];
      if (reader != null) {
        reader.close();
      }
    }
  }
}
 
Example #24
Source File: SequenceFileInputFormat.java    From hadoop-gpu with Apache License 2.0
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
  FileStatus[] files = super.listStatus(job);
  for (int i = 0; i < files.length; i++) {
    FileStatus file = files[i];
    if (file.isDir()) {     // it's a MapFile
      Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
      FileSystem fs = file.getPath().getFileSystem(job);
      // use the data file
      files[i] = fs.getFileStatus(dataFile);
    }
  }
  return files;
}
 
Example #25
Source File: TestSequenceFile.java    From compiler with Apache License 2.0
private static void openMaps(String path) {
	try {
		final Configuration conf = new Configuration();
		final FileSystem fs;
		fs = FileSystem.getLocal(conf);
		astMap = new MapFile.Reader(fs, new Path(path + "/ast").toString(), conf);
		commitMap = new MapFile.Reader(fs, new Path(path + "/commit").toString(), conf);
	} catch (final Exception e) {
		e.printStackTrace();
	}
}
 
Example #26
Source File: MapFileGen.java    From compiler with Apache License 2.0
public static void main(String[] args) throws Exception {
	System.out.println("generating data and index file");
	if (SEQ_FILE_PATH.isEmpty()) {
		System.out.println("Missing path to sequence file. Please specify it in the properties file.");
		return;
	}
	Configuration conf = new Configuration();
	FileSystem fs = FileSystem.get(conf);
	for (String name : new String[]{"ast", "commit"}) {
		Path dataFile = new Path(SEQ_FILE_PATH + "/" + name + "/" + MapFile.DATA_FILE_NAME);
		MapFile.fix(fs, dataFile.getParent(), LongWritable.class, BytesWritable.class, false, conf);
	}
	fs.close();
}
 
Example #27
Source File: CrawlDBTestUtil.java    From anthelion with Apache License 2.0
/**
 * Creates synthetic crawldb
 * 
 * @param fs
 *          filesystem where the db will be created
 * @param crawldb
 *          path where the db will be created
 * @param init
 *          urls to be inserted, objects are of type URLCrawlDatum
 * @throws Exception
 */
public static void createCrawlDb(Configuration conf, FileSystem fs, Path crawldb, List<URLCrawlDatum> init)
    throws Exception {
  LOG.trace("* creating crawldb: " + crawldb);
  Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
  MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir, "part-00000")
      .toString(), Text.class, CrawlDatum.class);
  Iterator<URLCrawlDatum> it = init.iterator();
  while (it.hasNext()) {
    URLCrawlDatum row = it.next();
    LOG.info("adding:" + row.url.toString());
    writer.append(new Text(row.url), row.datum);
  }
  writer.close();
}
 
Example #28
Source File: TestCrawlDbMerger.java    From anthelion with Apache License 2.0
private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb, TreeSet init, CrawlDatum cd) throws Exception {
  LOG.fine("* creating crawldb: " + crawldb);
  Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
  MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, "part-00000").toString(), Text.class, CrawlDatum.class);
  Iterator it = init.iterator();
  while (it.hasNext()) {
    String key = (String)it.next();
    writer.append(new Text(key), cd);
  }
  writer.close();
}
 
Example #29
Source File: TestSegmentMerger.java    From anthelion with Apache License 2.0
public void testLargeMerge() throws Exception {
  SegmentMerger merger = new SegmentMerger(conf);
  merger.merge(out, new Path[]{seg1, seg2}, false, false, -1);
  // verify output
  FileStatus[] stats = fs.listStatus(out);
  // there should be just one path
  assertEquals(1, stats.length);
  Path outSeg = stats[0].getPath();
  Text k = new Text();
  ParseText v = new ParseText();
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(outSeg, ParseText.DIR_NAME), conf);
  int cnt1 = 0, cnt2 = 0;
  for (MapFile.Reader r : readers) {
    while (r.next(k, v)) {
      String ks = k.toString();
      String vs = v.getText();
      if (ks.startsWith("seg1-")) {
        cnt1++;
        assertTrue(vs.startsWith("seg1 "));
      } else if (ks.startsWith("seg2-")) {
        cnt2++;
        assertTrue(vs.startsWith("seg2 "));
      }
    }
    r.close();
  }
  assertEquals(countSeg1, cnt1);
  assertEquals(countSeg2, cnt2);
}