Java Code Examples for org.apache.hadoop.io.MapFile#Reader

The following examples show how to use org.apache.hadoop.io.MapFile#Reader. Each example is taken from an open-source project; the originating project, source file, and license are noted in the heading above each snippet.
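
Before the project examples, here is a minimal read pattern as a sketch. The path "/tmp/example.map" and the Text key and value types are illustrative assumptions, and the usual Hadoop imports (org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path, org.apache.hadoop.io.*) are omitted, just as they are in the examples below. A MapFile is a directory holding a sorted "data" SequenceFile and an "index" file; Reader#get() returns the value for a key or null if the key is absent.

// Minimal sketch (assumed path and key/value types): open a MapFile,
// look up one key, then close the reader.
Configuration conf = new Configuration();
MapFile.Reader reader = new MapFile.Reader(new Path("/tmp/example.map"), conf);
try {
    Text value = new Text();
    // get() fills 'value' and returns non-null if the key exists
    if (reader.get(new Text("some-key"), value) != null) {
        System.out.println("found: " + value);
    }
} finally {
    reader.close();
}
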
Example 1
Source File: TestCodec.java    From hadoop with Apache License 2.0
private void codecTestMapFile(Class<? extends CompressionCodec> clazz,
    CompressionType type, int records) throws Exception {
  
  FileSystem fs = FileSystem.get(conf);
  LOG.info("Creating MapFiles with " + records  + 
          " records using codec " + clazz.getSimpleName());
  Path path = new Path(new Path(
      System.getProperty("test.build.data", "/tmp")),
    clazz.getSimpleName() + "-" + type + "-" + records);

  LOG.info("Writing " + path);
  createMapFile(conf, fs, path, clazz.newInstance(), type, records);
  MapFile.Reader reader = new MapFile.Reader(path, conf);
  Text key1 = new Text("002");
  assertNotNull(reader.get(key1, new Text()));
  Text key2 = new Text("004");
  assertNotNull(reader.get(key2, new Text()));
}
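
Example 1 relies on a createMapFile helper that is not shown above. The following is a hypothetical sketch of such a helper, written against the MapFile.Writer options API; the actual helper in TestCodec.java may differ. Keys are zero-padded so that their lexicographic order matches the sorted order a MapFile requires, which is why the test can look up "002" and "004".

// Hypothetical createMapFile-style helper (a sketch, not the original TestCodec code).
private static void createMapFile(Configuration conf, FileSystem fs, Path path,
    CompressionCodec codec, CompressionType type, int records) throws IOException {
  // fs is kept to match the call site in Example 1; the writer resolves the
  // filesystem from conf and path itself.
  MapFile.Writer writer = new MapFile.Writer(conf, path,
      MapFile.Writer.keyClass(Text.class),
      MapFile.Writer.valueClass(Text.class),
      MapFile.Writer.compression(type, codec));
  try {
    Text key = new Text();
    for (int j = 0; j < records; j++) {
      key.set(String.format("%03d", j)); // zero-padded keys sort correctly as Text
      writer.append(key, key);
    }
  } finally {
    writer.close();
  }
}
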
 
Example 2
Source File: MapFileReader.java    From deeplearning4j with Apache License 2.0
public MapFileReader(List<String> paths, IndexToKey indexToKey, Class<? extends Writable> recordClass)
                throws IOException {

    this.indexToKey = indexToKey;
    this.recordClass = recordClass;
    this.readers = new MapFile.Reader[paths.size()];

    SequenceFile.Reader.Option[] opts = new SequenceFile.Reader.Option[0];

    Configuration config = new Configuration();
    for (int i = 0; i < paths.size(); i++) {
        readers[i] = new MapFile.Reader(new Path(paths.get(i)), config, opts);
        if (readers[i].getValueClass() != recordClass) {
            throw new UnsupportedOperationException("MapFile record class: " + readers[i].getValueClass()
                            + ", but got class " + recordClass + ", path = " + paths.get(i));
        }
    }

    recordIndexesEachReader = indexToKey.initialize(readers, recordClass);
}
 
Example 3
Source File: MapFileOutputFormat.java    From RDFS with Apache License 2.0
/** Open the output generated by this format. */
public static MapFile.Reader[] getReaders(FileSystem ignored, Path dir,
                                          Configuration conf)
  throws IOException {
  FileSystem fs = dir.getFileSystem(conf);
  Path[] names = FileUtil.stat2Paths(fs.listStatus(dir));

  // sort names, so that hash partitioning works
  Arrays.sort(names);
  
  MapFile.Reader[] parts = new MapFile.Reader[names.length];
  for (int i = 0; i < names.length; i++) {
    parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
  }
  return parts;
}
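
The sort is what lets callers treat the returned array as indexed by partition: reducer outputs are named part-00000, part-00001, and so on, so after sorting, parts[i] is the MapFile written by partition i. getEntry (Example 4) depends on this when it computes the partition number with the same Partitioner used at write time and indexes directly into the readers array.
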
 
Example 4
Source File: MapFileOutputFormat.java    From hadoop with Apache License 2.0
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable, V extends Writable>
Writable getEntry(MapFile.Reader[] readers,
                                Partitioner<K, V> partitioner,
                                K key,
                                V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}
 
Example 5
Source File: LinkDumper.java    From anthelion with Apache License 2.0
public static void main(String[] args)
  throws Exception {
  
  if (args == null || args.length < 2) {
    System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
    return;
  }

  // open the readers for the linkdump directory
  Configuration conf = NutchConfiguration.create();
  FileSystem fs = FileSystem.get(conf);
  Path webGraphDb = new Path(args[0]);
  String url = args[1];
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
    webGraphDb, DUMP_DIR), conf);

  // get the link nodes for the url
  Text key = new Text(url);
  LinkNodes nodes = new LinkNodes();
  MapFileOutputFormat.getEntry(readers,
    new HashPartitioner<Text, LinkNodes>(), key, nodes);

  // print out the link nodes
  LinkNode[] linkNodesAr = nodes.getLinks();
  System.out.println(url + ":");
  for (LinkNode node : linkNodesAr) {
    System.out.println("  " + node.getUrl() + " - "
      + node.getNode().toString());
  }

  // close the readers
  FSUtils.closeReaders(readers);
}
 
Example 6
Source File: MapFileOutputFormat.java    From hadoop-gpu with Apache License 2.0
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable, V extends Writable>
Writable getEntry(MapFile.Reader[] readers,
                                Partitioner<K, V> partitioner,
                                K key,
                                V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}
 
Example 7
Source File: SegmentHandler.java    From anthelion with Apache License 2.0
/** Open the output generated by this format. */
private MapFile.Reader[] getReaders(String subDir) throws IOException {
  Path dir = new Path(segmentDir, subDir);
  FileSystem fs = dir.getFileSystem(conf);
  Path[] names = FileUtil.stat2Paths(fs.listStatus(dir, SegmentPathFilter.INSTANCE));

  // sort names, so that hash partitioning works
  Arrays.sort(names);
  
  MapFile.Reader[] parts = new MapFile.Reader[names.length];
  for (int i = 0; i < names.length; i++) {
    parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
  }
  return parts;
}
 
Example 8
Source File: TestSegmentMergerCrawlDatums.java    From nutch-htmlunit with Apache License 2.0
/**
 * Checks the merged segment and removes the test directory afterwards.
 *
 * @param testDir the test directory
 * @param mergedSegment the merged segment
 * @return the final fetch status
 */
protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception  {
  // Get a MapFile reader for the <Text,CrawlDatum> pairs
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf);
  
  Text key = new Text();
  CrawlDatum value = new CrawlDatum();
  byte finalStatus = 0x0;
  
  for (MapFile.Reader reader : readers) {
    while (reader.next(key, value)) {
      LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus()));
      
      // Only consider fetch status
      if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) {
        finalStatus = value.getStatus();
      }
    }
    
    // Close the reader again
    reader.close();
  }

  // Remove the test directory again
  fs.delete(testDir, true);
  
  LOG.info("Final fetch status for: http://nutch.apache.org/ > " + CrawlDatum.getStatusName(finalStatus));

  // Return the final status
  return finalStatus;
}
 
Example 9
Source File: MapFileOutputFormat.java    From big-c with Apache License 2.0
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable<?>, V extends Writable>
    Writable getEntry(MapFile.Reader[] readers, 
    Partitioner<K, V> partitioner, K key, V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}
 
Example 10
Source File: MapFileOutputFormat.java    From big-c with Apache License 2.0
/** Get an entry from output generated by this class. */
public static <K extends WritableComparable, V extends Writable>
Writable getEntry(MapFile.Reader[] readers,
                                Partitioner<K, V> partitioner,
                                K key,
                                V value) throws IOException {
  int part = partitioner.getPartition(key, value, readers.length);
  return readers[part].get(key, value);
}
 
Example 11
Source File: FSUtils.java    From nutch-htmlunit with Apache License 2.0
/**
 * Closes a group of MapFile readers.
 * 
 * @param readers The MapFile readers to close.
 * @throws IOException If an error occurs while closing a reader.
 */
public static void closeReaders(MapFile.Reader[] readers)
  throws IOException {
  
  // loop through the readers closing one by one
  if (readers != null) {
    for (int i = 0; i < readers.length; i++) {
      MapFile.Reader reader = readers[i];
      if (reader != null) {
        reader.close();
      }
    }
  }
}
 
Example 12
Source File: LinkDumper.java    From nutch-htmlunit with Apache License 2.0
public static void main(String[] args)
  throws Exception {
  
  if (args == null || args.length < 2) {
    System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>");
    return;
  }

  // open the readers for the linkdump directory
  Configuration conf = NutchConfiguration.create();
  FileSystem fs = FileSystem.get(conf);
  Path webGraphDb = new Path(args[0]);
  String url = args[1];
  MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(
    webGraphDb, DUMP_DIR), conf);

  // get the link nodes for the url
  Text key = new Text(url);
  LinkNodes nodes = new LinkNodes();
  MapFileOutputFormat.getEntry(readers,
    new HashPartitioner<Text, LinkNodes>(), key, nodes);

  // print out the link nodes
  LinkNode[] linkNodesAr = nodes.getLinks();
  System.out.println(url + ":");
  for (LinkNode node : linkNodesAr) {
    System.out.println("  " + node.getUrl() + " - "
      + node.getNode().toString());
  }

  // close the readers
  FSUtils.closeReaders(readers);
}
 
Example 13
Source File: TestSequenceFile.java    From compiler with Apache License 2.0
private static void closeMap(MapFile.Reader map) {
	if (map != null)
		try {
			map.close();
		} catch (final IOException e) {
			e.printStackTrace();
		}
	map = null;
}
 
Example 14
Source File: FileSplit.java    From mrgeo with Apache License 2.0
public void generateSplits(Path parent, Configuration conf) throws IOException
{
  List<FileSplitInfo> list = new ArrayList<>();

  // get a Hadoop file system handle
  FileSystem fs = getFileSystem(parent);

  // get the list of paths of the subdirectories of the parent
  Path[] paths = FileUtil.stat2Paths(fs.listStatus(parent));

  Arrays.sort(paths);

  int partition = 0;
  // look inside each subdirectory for a data dir and keep track
  for (Path p : paths)
  {
    Path mapfile = null;
    FileStatus[] dirFiles = fs.listStatus(p);
    for (FileStatus dirFile : dirFiles)
    {
      if (dirFile.getPath().getName().equals("data"))
      {
        mapfile = dirFile.getPath().getParent();
        break;
      }
    }

    if (mapfile != null)
    {
      RasterWritable val = new RasterWritable();
      MapFile.Reader reader = createMapFileReader(conf, mapfile);
      TileIdWritable firstKey = (TileIdWritable) reader.getClosest(new TileIdWritable(0), val);
      TileIdWritable lastKey = (TileIdWritable) reader.getClosest(new TileIdWritable(Long.MAX_VALUE), val, true);
      if (firstKey != null && lastKey != null)
      {
        list.add(new FileSplitInfo(firstKey.get(), lastKey.get(), mapfile.getName(), partition));
      }

      partition++;
    }
  }

  splits = list.toArray(new FileSplitInfo[list.size()]);
}
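
The two getClosest calls bracket each MapFile's key range: the two-argument form positions at the smallest key greater than or equal to the given key, while the three-argument form with true as the final argument returns the largest key less than or equal to it. Probing with 0 and Long.MAX_VALUE therefore yields the first and last tile IDs of each map file, and both calls return null on an empty file, which is why a split is only added when both keys are non-null.
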
 
Example 15
Source File: FileSplitTest.java    From mrgeo with Apache License 2.0
@Test
@Category(UnitTest.class)
public void testGenerateSplitsFromPath() throws Exception
{
  // Setup a mock directory structure
  Path rootPath = new Path(FileSplitTest.class.getName() + "-testRootPath");
  Path path1 = new Path(rootPath, FileSplitTest.class.getName() + "-testPath1");
  Path path2 = new Path(rootPath, FileSplitTest.class.getName() + "-testPath2");
  Path path3 = new Path(rootPath, FileSplitTest.class.getName() + "-testPath3");
  Path path1_1 = new Path(path1, "notDataDir");
  Path path1_2 = new Path(path1, "data");
  Path path2_1 = new Path(path2, "data");
  Path path3_1 = new Path(path3, "notDataDir");

  // Setup the FileSystem
  FileSystem mockFS = new FileSystemBuilder()
      .fileStatus(rootPath, new FileStatusBuilder().path(path1).build())
      .fileStatus(rootPath, new FileStatusBuilder().path(path2).build())
      .fileStatus(rootPath, new FileStatusBuilder().path(path3).build())
      .fileStatus(path1, new FileStatusBuilder().path(path1_1).build())
      .fileStatus(path1, new FileStatusBuilder().path(path1_2).build())
      .fileStatus(path2, new FileStatusBuilder().path(path2_1).build())
      .fileStatus(path3, new FileStatusBuilder().path(path3_1).build())
      .build();

  // setup map file readers for each of the data directories
  RasterWritable mockValue = new RasterWritable();
  TileIdWritable[] path1Keys = {new TileIdWritable(2L), new TileIdWritable(4L), new TileIdWritable(6L)};
  RasterWritable[] path1Values = {mockValue, mockValue, mockValue};
  TileIdWritable[] path2Keys = {new TileIdWritable(5L), new TileIdWritable(6L), new TileIdWritable(7L)};
  RasterWritable[] path2Values = {mockValue, mockValue, mockValue};
  MapFile.Reader mockMapFileReaderPath1 = new MapFileReaderBuilder()
      .keyClass(TileIdWritable.class)
      .valueClass(RasterWritable.class)
      .keys(path1Keys)
      .values(path1Values)
      .build();

  MapFile.Reader mockMapFileReaderPath2 = new MapFileReaderBuilder()
      .keyClass(TileIdWritable.class)
      .valueClass(RasterWritable.class)
      .keys(path2Keys)
      .values(path2Values)
      .build();

  // Setup a Configuration
  Configuration mockConfiguration = new ConfigurationBuilder().build();


  FileSplit spySubject = new FileSplit();
  subject = spy(spySubject);
  doReturn(mockFS).when(subject).getFileSystem(rootPath);
  doReturn(mockMapFileReaderPath1).when(subject).createMapFileReader(mockConfiguration, path1);
  doReturn(mockMapFileReaderPath2).when(subject).createMapFileReader(mockConfiguration, path2);
  subject.generateSplits(rootPath, mockConfiguration);

  // Verify we got splits for path 1 and 2
  SplitInfo[] splits = subject.getSplits();
  Assert.assertEquals(2, splits.length);
  verifySplit(path1, path1Keys, splits, 0);
  verifySplit(path2, path2Keys, splits, 1);
}
 
Example 16
Source File: MapFileReader.java    From DataVec with Apache License 2.0
@Override
public void close() throws IOException {
    for (MapFile.Reader r : readers) {
        r.close();
    }
}
 
Example 17
Source File: LongIndexToKey.java    From deeplearning4j with Apache License 2.0
@Override
public List<Pair<Long, Long>> initialize(MapFile.Reader[] readers, Class<? extends Writable> valueClass)
                throws IOException {

    List<Pair<Long, Long>> l = new ArrayList<>(readers.length);
    for (MapFile.Reader r : readers) {
        //Get the first and last keys:
        long first = -1;
        long last = -1;

        //First key: no method for this for some inexplicable reason :/
        LongWritable k = new LongWritable();
        Writable v = ReflectionUtils.newInstance(valueClass, null);
        boolean hasNext = r.next(k, v);
        if(!hasNext){
            //This map file is empty - no data
            l.add(new Pair<>(-1L, -1L));
            continue;
        }
        first = k.get();

        //Last key: easy
        r.reset();
        r.finalKey(k);
        last = k.get();

        l.add(new Pair<>(first, last));
    }

    //Check that things are actually contiguous:
    List<Pair<Long, Long>> sorted = new ArrayList<>(l.size());
    for(Pair<Long,Long> p : l){
        if(p.getLeft() >= 0){
            sorted.add(p);
        }
    }
    Collections.sort(sorted, new Comparator<Pair<Long, Long>>() {
        @Override
        public int compare(Pair<Long, Long> o1, Pair<Long, Long> o2) {
            return Long.compare(o1.getFirst(), o2.getFirst());
        }
    });

    if (sorted.size() == 0){
        throw new IllegalStateException("Map file is empty - no data available");
    }
    if (sorted.get(0).getFirst() != 0L) {
        throw new UnsupportedOperationException("Minimum key value is not 0: got " + sorted.get(0).getFirst());
    }

    for (int i = 0; i < sorted.size() - 1; i++) {
        long currLast = sorted.get(i).getSecond();
        long nextFirst = sorted.get(i + 1).getFirst();

        if(nextFirst == -1){
            //Skip empty map file
            continue;
        }

        if (currLast + 1 != nextFirst) {
            throw new IllegalStateException(
                            "Keys are not contiguous between readers: first/last indices (inclusive) " + "are "
                                            + sorted
                                            + ".\n LongIndexKey assumes unique and contiguous LongWritable keys");
        }
    }

    readerIndices = l;
    return readerIndices;
}
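
A note on the first/last key retrieval above: MapFile.Reader has no firstKey() method, so the first key is read with next(); reset() then repositions the reader before the first entry so that later sequential reads start from the beginning, and finalKey() fills in the last key without leaving the reader repositioned.
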
 
Example 18
Source File: SegmentHandler.java    From nutch-htmlunit with Apache License 2.0
private Writable getEntry(MapFile.Reader[] readers, Text url,
                          Writable entry) throws IOException {
  return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
}
 
Example 19
Source File: FileSplit.java    From mrgeo with Apache License 2.0
protected MapFile.Reader createMapFileReader(Configuration conf, Path mapfile) throws IOException
{
  return new MapFile.Reader(mapfile, conf);
}
 
Example 20
Source File: SegmentHandler.java    From anthelion with Apache License 2.0
private void closeReaders(MapFile.Reader[] readers) throws IOException {
  for (int i = 0; i < readers.length; i++) {
    readers[i].close();
  }
}