Java Code Examples for org.apache.hadoop.mapred.Reporter#progress()

The following examples show how to use org.apache.hadoop.mapred.Reporter#progress(). Each example is taken from an open-source project; the source file, project, and license are noted above it.
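Before the project-specific examples, here is a minimal, self-contained sketch of the typical pattern (not taken from any project below): the framework hands a Reporter to map()/reduce(), and calling progress() periodically inside a long-running loop signals that the task is still alive so it is not killed for inactivity (governed by the task timeout setting, mapreduce.task.timeout in newer releases, mapred.task.timeout in older ones). The class name and the reporting interval are illustrative assumptions, not part of the examples that follow.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Illustrative mapper (old mapred API) that pings the framework while
// processing a potentially large record.
public class LongRunningMapper extends MapReduceBase
    implements Mapper<LongWritable, Text, Text, LongWritable> {

  @Override
  public void map(LongWritable key, Text value,
      OutputCollector<Text, LongWritable> out, Reporter reporter)
      throws IOException {
    long processed = 0;
    for (String token : value.toString().split("\\s+")) {
      out.collect(new Text(token), new LongWritable(1));
      processed++;
      // progress() does not change the reported progress value; it only
      // tells the framework the task is still making progress (liveness ping).
      if (processed % 1000 == 0) {
        reporter.progress();
      }
    }
  }
}
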
Example 1
Source File: CqlInputFormat.java    From stratio-cassandra with Apache License 2.0
public RecordReader<Long, Row> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter)
        throws IOException
{
    TaskAttemptContext tac = new TaskAttemptContext(jobConf, TaskAttemptID.forName(jobConf.get(MAPRED_TASK_ID)))
    {
        @Override
        public void progress()
        {
            reporter.progress();
        }
    };

    CqlRecordReader recordReader = new CqlRecordReader();
    recordReader.initialize((org.apache.hadoop.mapreduce.InputSplit)split, tac);
    return recordReader;
}
 
Example 2
Source File: HadoopArchives.java    From RDFS with Apache License 2.0
public void reduce(IntWritable key, Iterator<Text> values,
    OutputCollector<Text, Text> out,
    Reporter reporter) throws IOException {
  keyVal = key.get();
  while(values.hasNext()) {
    Text value = values.next();
    String towrite = value.toString() + "\n";
    indexStream.write(towrite.getBytes());
    written++;
    if (written > numIndexes -1) {
      // every 1000 indexes we report status
      reporter.setStatus("Creating index for archives");
      reporter.progress();
      endIndex = keyVal;
      String masterWrite = startIndex + " " + endIndex + " " + startPos 
                          +  " " + indexStream.getPos() + " \n" ;
      outStream.write(masterWrite.getBytes());
      startPos = indexStream.getPos();
      startIndex = endIndex;
      written = 0;
    }
  }
}
 
Example 3
Source File: HadoopArchives.java    From hadoop with Apache License 2.0
public void reduce(IntWritable key, Iterator<Text> values,
    OutputCollector<Text, Text> out,
    Reporter reporter) throws IOException {
  keyVal = key.get();
  while(values.hasNext()) {
    Text value = values.next();
    String towrite = value.toString() + "\n";
    indexStream.write(towrite.getBytes(Charsets.UTF_8));
    written++;
    if (written > numIndexes -1) {
      // every 1000 indexes we report status
      reporter.setStatus("Creating index for archives");
      reporter.progress();
      endIndex = keyVal;
      String masterWrite = startIndex + " " + endIndex + " " + startPos 
                          +  " " + indexStream.getPos() + " \n" ;
      outStream.write(masterWrite.getBytes(Charsets.UTF_8));
      startPos = indexStream.getPos();
      startIndex = endIndex;
      written = 0;
    }
  }
}
 
Example 4
Source File: HiveDynamoDBInputFormat.java    From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws
    IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);
  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context = buildHiveDynamoDBRecordReaderContext(bbSplit, conf,
      reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
 
Example 5
Source File: HadoopArchives.java    From big-c with Apache License 2.0
public void reduce(IntWritable key, Iterator<Text> values,
    OutputCollector<Text, Text> out,
    Reporter reporter) throws IOException {
  keyVal = key.get();
  while(values.hasNext()) {
    Text value = values.next();
    String towrite = value.toString() + "\n";
    indexStream.write(towrite.getBytes(Charsets.UTF_8));
    written++;
    if (written > numIndexes -1) {
      // every 1000 indexes we report status
      reporter.setStatus("Creating index for archives");
      reporter.progress();
      endIndex = keyVal;
      String masterWrite = startIndex + " " + endIndex + " " + startPos 
                          +  " " + indexStream.getPos() + " \n" ;
      outStream.write(masterWrite.getBytes(Charsets.UTF_8));
      startPos = indexStream.getPos();
      startIndex = endIndex;
      written = 0;
    }
  }
}
 
Example 6
Source File: HadoopArchives.java    From big-c with Apache License 2.0
public void map(LongWritable key, HarEntry value,
    OutputCollector<IntWritable, Text> out,
    Reporter reporter) throws IOException {
  Path relPath = new Path(value.path);
  int hash = HarFileSystem.getHarHash(relPath);
  String towrite = null;
  Path srcPath = realPath(relPath, rootPath);
  long startPos = partStream.getPos();
  FileSystem srcFs = srcPath.getFileSystem(conf);
  FileStatus srcStatus = srcFs.getFileStatus(srcPath);
  String propStr = encodeProperties(srcStatus);
  if (value.isDir()) { 
    towrite = encodeName(relPath.toString())
              + " dir " + propStr + " 0 0 ";
    StringBuffer sbuff = new StringBuffer();
    sbuff.append(towrite);
    for (String child: value.children) {
      sbuff.append(encodeName(child) + " ");
    }
    towrite = sbuff.toString();
    //reading directories is also progress
    reporter.progress();
  }
  else {
    FSDataInputStream input = srcFs.open(srcStatus.getPath());
    reporter.setStatus("Copying file " + srcStatus.getPath() + 
        " to archive.");
    copyData(srcStatus.getPath(), input, partStream, reporter);
    towrite = encodeName(relPath.toString())
              + " file " + partname + " " + startPos
              + " " + srcStatus.getLen() + " " + propStr + " ";
  }
  out.collect(new IntWritable(hash), new Text(towrite));
}
 
Example 7
Source File: HadoopArchives.java    From RDFS with Apache License 2.0
public void map(LongWritable key, HarEntry value,
    OutputCollector<IntWritable, Text> out,
    Reporter reporter) throws IOException {
  Path relPath = new Path(value.path);
  int hash = HarFileSystem.getHarHash(relPath);
  String towrite = null;
  Path srcPath = realPath(relPath, rootPath);
  long startPos = partStream.getPos();
  FileSystem srcFs = srcPath.getFileSystem(conf);
  FileStatus srcStatus = srcFs.getFileStatus(srcPath);
  String propStr = URLEncoder.encode(
                      srcStatus.getModificationTime() + " "
                    + srcStatus.getAccessTime() + " "
                    + srcStatus.getPermission().toShort() + " "
                    + URLEncoder.encode(srcStatus.getOwner(), "UTF-8") + " "
                    + URLEncoder.encode(srcStatus.getGroup(), "UTF-8"),
                   "UTF-8");
  if (value.isDir()) { 
    towrite = URLEncoder.encode(relPath.toString(),"UTF-8")  
              + " dir " + propStr + " 0 0 ";
    StringBuffer sbuff = new StringBuffer();
    sbuff.append(towrite);
    for (String child: value.children) {
      sbuff.append(URLEncoder.encode(child,"UTF-8") + " ");
    }
    towrite = sbuff.toString();
    //reading directories is also progress
    reporter.progress();
  }
  else {
    FSDataInputStream input = srcFs.open(srcStatus.getPath());
    reporter.setStatus("Copying file " + srcStatus.getPath() + 
        " to archive.");
    copyData(srcStatus.getPath(), input, partStream, reporter);
    towrite = URLEncoder.encode(relPath.toString(),"UTF-8")
              + " file " + partname + " " + startPos
              + " " + srcStatus.getLen() + " " + propStr + " ";
  }
  out.collect(new IntWritable(hash), new Text(towrite));
}
 
Example 8
Source File: HadoopArchives.java    From RDFS with Apache License 2.0
public void copyData(Path input, FSDataInputStream fsin, 
    FSDataOutputStream fout, Reporter reporter) throws IOException {
  try {
    for (int cbread=0; (cbread = fsin.read(buffer))>= 0;) {
      fout.write(buffer, 0,cbread);
      reporter.progress();
    }
  } finally {
    fsin.close();
  }
}
 
Example 9
Source File: UDFWithOptions.java    From incubator-hivemall with Apache License 2.0
protected static void reportProgress(@Nonnull Reporter reporter) {
    if (reporter != null) {
        synchronized (reporter) {
            reporter.progress();
        }
    }
}
 
Example 10
Source File: UDTFWithOptions.java    From incubator-hivemall with Apache License 2.0
protected static void reportProgress(@Nullable Reporter reporter) {
    if (reporter != null) {
        synchronized (reporter) {
            reporter.progress();
        }
    }
}
 
Example 11
Source File: DynamoDBFibonacciRetryer.java    From emr-dynamodb-connector with Apache License 2.0
private void incrementRetryCounter(Reporter reporter, PrintCounter retryCounter) {
  if (reporter != null) {
    if (retryCounter != null) {
      reporter.incrCounter(retryCounter.getGroup(), retryCounter.getName(), 1);
    } else {
      reporter.progress();
    }
  }
}
 
Example 12
Source File: HadoopArchives.java    From hadoop-gpu with Apache License 2.0
public void map(LongWritable key, Text value,
    OutputCollector<IntWritable, Text> out,
    Reporter reporter) throws IOException {
  String line  = value.toString();
  MapStat mstat = new MapStat(line);
  Path srcPath = new Path(mstat.pathname);
  String towrite = null;
  Path relPath = makeRelative(srcPath);
  int hash = HarFileSystem.getHarHash(relPath);
  long startPos = partStream.getPos();
  if (mstat.isDir) { 
    towrite = relPath.toString() + " " + "dir none " + 0 + " " + 0 + " ";
    StringBuffer sbuff = new StringBuffer();
    sbuff.append(towrite);
    for (String child: mstat.children) {
      sbuff.append(child + " ");
    }
    towrite = sbuff.toString();
    //reading directories is also progress
    reporter.progress();
  }
  else {
    FileSystem srcFs = srcPath.getFileSystem(conf);
    FileStatus srcStatus = srcFs.getFileStatus(srcPath);
    FSDataInputStream input = srcFs.open(srcStatus.getPath());
    reporter.setStatus("Copying file " + srcStatus.getPath() + 
        " to archive.");
    copyData(srcStatus.getPath(), input, partStream, reporter);
    towrite = relPath.toString() + " file " + partname + " " + startPos
    + " " + srcStatus.getLen() + " ";
  }
  out.collect(new IntWritable(hash), new Text(towrite));
}
 
Example 13
Source File: HadoopArchives.java    From hadoop-gpu with Apache License 2.0
public void copyData(Path input, FSDataInputStream fsin, 
    FSDataOutputStream fout, Reporter reporter) throws IOException {
  try {
    for (int cbread=0; (cbread = fsin.read(buffer))>= 0;) {
      fout.write(buffer, 0,cbread);
      reporter.progress();
    }
  } finally {
    fsin.close();
  }
}
 
Example 14
Source File: OnDiskMapOutput.java    From hadoop with Apache License 2.0
@Override
public void shuffle(MapHost host, InputStream input,
                    long compressedLength, long decompressedLength,
                    ShuffleClientMetrics metrics,
                    Reporter reporter) throws IOException {
  input = new IFileInputStream(input, compressedLength, conf);
  // Copy data to local-disk
  long bytesLeft = compressedLength;
  try {
    final int BYTES_TO_READ = 64 * 1024;
    byte[] buf = new byte[BYTES_TO_READ];
    while (bytesLeft > 0) {
      int n = ((IFileInputStream)input).readWithChecksum(buf, 0, (int) Math.min(bytesLeft, BYTES_TO_READ));
      if (n < 0) {
        throw new IOException("read past end of stream reading " + 
                              getMapId());
      }
      disk.write(buf, 0, n);
      bytesLeft -= n;
      metrics.inputBytes(n);
      reporter.progress();
    }

    LOG.info("Read " + (compressedLength - bytesLeft) + 
             " bytes from map-output for " + getMapId());

    disk.close();
  } catch (IOException ioe) {
    // Close the streams
    IOUtils.cleanup(LOG, input, disk);

    // Re-throw
    throw ioe;
  }

  // Sanity check
  if (bytesLeft != 0) {
    throw new IOException("Incomplete map output received for " +
                          getMapId() + " from " +
                          host.getHostName() + " (" + 
                          bytesLeft + " bytes missing of " + 
                          compressedLength + ")");
  }
  this.compressedSize = compressedLength;
}
 
Example 15
Source File: Loops.java    From nutch-htmlunit with Apache License 2.0
/**
 * Performs a single loop pass looking for loop cycles within routes. If
 * this is not the last loop cycle then the url will be mapped for further
 * passes.
 */
public void reduce(Text key, Iterator<ObjectWritable> values,
  OutputCollector<Text, Route> output, Reporter reporter)
  throws IOException {

  List<Route> routeList = new ArrayList<Route>();
  Set<String> outlinkUrls = new LinkedHashSet<String>();
  int numValues = 0;

  // aggregate all routes and outlinks for a given url
  while (values.hasNext()) {
    ObjectWritable next = values.next();
    Object value = next.get();
    if (value instanceof Route) {
      routeList.add(WritableUtils.clone((Route)value, conf));
    }
    else if (value instanceof Text) {
      String outlinkUrl = ((Text)value).toString();
      if (!outlinkUrls.contains(outlinkUrl)) {
        outlinkUrls.add(outlinkUrl);
      }
    }

    // specify progress, could be a lot of routes
    numValues++;
    if (numValues % 100 == 0) {
      reporter.progress();
    }
  }

  // loop through the route list
  Iterator<Route> routeIt = routeList.listIterator();
  while (routeIt.hasNext()) {

    // removing the route for space concerns, could be a lot of routes
    // if the route is already found, meaning it is a loop just collect it
    // urls with no outlinks that are not found will fall off
    Route route = routeIt.next();
    routeIt.remove();
    if (route.isFound()) {
      output.collect(key, route);
    }
    else {

      // if the route start url is found, set route to found and collect
      String lookingFor = route.getLookingFor();
      if (outlinkUrls.contains(lookingFor)) {
        route.setFound(true);
        output.collect(key, route);
      }
      else if (!last) {

        // setup for next pass through the loop
        for (String outlink : outlinkUrls) {
          output.collect(new Text(outlink), route);
        }
      }
    }
  }
}
 
Example 16
Source File: TestShufflePlugin.java    From big-c with Apache License 2.0
@Test
/**
 * A testing method verifying availability and accessibility of API that is needed
 * for sub-classes of ShuffleConsumerPlugin
 */
public void testConsumerApi() {

  JobConf jobConf = new JobConf();
  ShuffleConsumerPlugin<K, V> shuffleConsumerPlugin = new TestShuffleConsumerPlugin<K, V>();

  //mock creation
  ReduceTask mockReduceTask = mock(ReduceTask.class);
  TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
  Reporter mockReporter = mock(Reporter.class);
  FileSystem mockFileSystem = mock(FileSystem.class);
  Class<? extends org.apache.hadoop.mapred.Reducer>  combinerClass = jobConf.getCombinerClass();
  @SuppressWarnings("unchecked")  // needed for mock with generic
  CombineOutputCollector<K, V>  mockCombineOutputCollector =
    (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
  org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID =
    mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
  LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
  CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
  Counter mockCounter = mock(Counter.class);
  TaskStatus mockTaskStatus = mock(TaskStatus.class);
  Progress mockProgress = mock(Progress.class);
  MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
  Task mockTask = mock(Task.class);

  try {
    String [] dirs = jobConf.getLocalDirs();
    // verify that these APIs are available through super class handler
    ShuffleConsumerPlugin.Context<K, V> context =
   new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, jobConf, mockFileSystem,
                                              mockUmbilical, mockLocalDirAllocator,
                                              mockReporter, mockCompressionCodec,
                                              combinerClass, mockCombineOutputCollector,
                                              mockCounter, mockCounter, mockCounter,
                                              mockCounter, mockCounter, mockCounter,
                                              mockTaskStatus, mockProgress, mockProgress,
                                              mockTask, mockMapOutputFile, null);
    shuffleConsumerPlugin.init(context);
    shuffleConsumerPlugin.run();
    shuffleConsumerPlugin.close();
  }
  catch (Exception e) {
    assertTrue("Threw exception:" + e, false);
  }

  // verify that these APIs are available for 3rd party plugins
  mockReduceTask.getTaskID();
  mockReduceTask.getJobID();
  mockReduceTask.getNumMaps();
  mockReduceTask.getPartition();
  mockReporter.progress();
}
 
Example 17
Source File: OnDiskMapOutput.java    From big-c with Apache License 2.0
@Override
public void shuffle(MapHost host, InputStream input,
                    long compressedLength, long decompressedLength,
                    ShuffleClientMetrics metrics,
                    Reporter reporter) throws IOException {
  input = new IFileInputStream(input, compressedLength, conf);
  // Copy data to local-disk
  long bytesLeft = compressedLength;
  try {
    final int BYTES_TO_READ = 64 * 1024;
    byte[] buf = new byte[BYTES_TO_READ];
    while (bytesLeft > 0) {
      int n = ((IFileInputStream)input).readWithChecksum(buf, 0, (int) Math.min(bytesLeft, BYTES_TO_READ));
      if (n < 0) {
        throw new IOException("read past end of stream reading " + 
                              getMapId());
      }
      disk.write(buf, 0, n);
      bytesLeft -= n;
      metrics.inputBytes(n);
      reporter.progress();
    }

    LOG.info("Read " + (compressedLength - bytesLeft) + 
             " bytes from map-output for " + getMapId());

    disk.close();
  } catch (IOException ioe) {
    // Close the streams
    IOUtils.cleanup(LOG, input, disk);

    // Re-throw
    throw ioe;
  }

  // Sanity check
  if (bytesLeft != 0) {
    throw new IOException("Incomplete map output received for " +
                          getMapId() + " from " +
                          host.getHostName() + " (" + 
                          bytesLeft + " bytes missing of " + 
                          compressedLength + ")");
  }
  this.compressedSize = compressedLength;
}
 
Example 18
Source File: HiveCassandraStandardColumnInputFormat.java    From Hive-Cassandra with Apache License 2.0
@Override
public RecordReader<BytesWritable, MapWritable> getRecordReader(InputSplit split,
    JobConf jobConf, final Reporter reporter) throws IOException {
  HiveCassandraStandardSplit cassandraSplit = (HiveCassandraStandardSplit) split;

  List<String> columns = AbstractColumnSerDe.parseColumnMapping(cassandraSplit.getColumnMapping());
  isTransposed = AbstractColumnSerDe.isTransposed(columns);


  List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);

  if (columns.size() < readColIDs.size()) {
    throw new IOException("Cannot read more columns than the given table contains.");
  }

  org.apache.cassandra.hadoop.ColumnFamilySplit cfSplit = cassandraSplit.getSplit();
  Job job = new Job(jobConf);

  TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
    @Override
    public void progress() {
      reporter.progress();
    }
  };

  SlicePredicate predicate = new SlicePredicate();

  if (isTransposed || readColIDs.size() == columns.size() || readColIDs.size() == 0) {
    SliceRange range = new SliceRange();
    AbstractType comparator = BytesType.instance;

    String comparatorType = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_COMPARATOR);
    if (comparatorType != null && !comparatorType.equals("")) {
      try {
        comparator = TypeParser.parse(comparatorType);
      } catch (Exception ex) {
        throw new IOException("Comparator class not found.");
      }
    }

    String sliceStart = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_START);
    String sliceEnd = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_FINISH);
    String reversed = jobConf.get(AbstractColumnSerDe.CASSANDRA_SLICE_PREDICATE_RANGE_REVERSED);

    range.setStart(comparator.fromString(sliceStart == null ? "" : sliceStart));
    range.setFinish(comparator.fromString(sliceEnd == null ? "" : sliceEnd));
    range.setReversed(reversed == null ? false : reversed.equals("true"));
    range.setCount(cassandraSplit.getSlicePredicateSize());
    predicate.setSlice_range(range);
  } else {
    int iKey = columns.indexOf(AbstractColumnSerDe.CASSANDRA_KEY_COLUMN);
    predicate.setColumn_names(getColumnNames(iKey, columns, readColIDs));
  }


  try {
    ConfigHelper.setInputColumnFamily(tac.getConfiguration(),
        cassandraSplit.getKeyspace(), cassandraSplit.getColumnFamily());

    ConfigHelper.setInputSlicePredicate(tac.getConfiguration(), predicate);
    ConfigHelper.setRangeBatchSize(tac.getConfiguration(), cassandraSplit.getRangeBatchSize());
    ConfigHelper.setInputRpcPort(tac.getConfiguration(), cassandraSplit.getPort() + "");
    ConfigHelper.setInputInitialAddress(tac.getConfiguration(), cassandraSplit.getHost());
    ConfigHelper.setInputPartitioner(tac.getConfiguration(), cassandraSplit.getPartitioner());
    // Set Split Size
    ConfigHelper.setInputSplitSize(tac.getConfiguration(), cassandraSplit.getSplitSize());

    CassandraHiveRecordReader rr = null;

    if(isTransposed && tac.getConfiguration().getBoolean(AbstractColumnSerDe.CASSANDRA_ENABLE_WIDEROW_ITERATOR, true)) {
      rr = new CassandraHiveRecordReader(new ColumnFamilyWideRowRecordReader(), isTransposed);
    } else {
      rr = new CassandraHiveRecordReader(new ColumnFamilyRecordReader(), isTransposed);
    }
    rr.initialize(cfSplit, tac);

    return rr;

  } catch (Exception ie) {
    throw new IOException(ie);
  }
}
 
Example 19
Source File: ArcSegmentCreator.java    From anthelion with Apache License 2.0
/**
 * <p>Runs the Map job to translate an arc record into output for Nutch 
 * segments.</p>
 * 
 * @param key The arc record header.
 * @param bytes The arc record raw content bytes.
 * @param output The output collector.
 * @param reporter The progress reporter.
 */
public void map(Text key, BytesWritable bytes,
  OutputCollector<Text, NutchWritable> output, Reporter reporter)
  throws IOException {

  String[] headers = key.toString().split("\\s+");
  String urlStr = headers[0];
  String version = headers[2];
  String contentType = headers[3];
  
  // arcs start with a file description.  for now we ignore this as it is not
  // a content record
  if (urlStr.startsWith("filedesc://")) {
    LOG.info("Ignoring file header: " + urlStr);
    return;
  }
  LOG.info("Processing: " + urlStr);

  // get the raw  bytes from the arc file, create a new crawldatum
  Text url = new Text();
  CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
    1.0f);
  String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

  // normalize and filter the urls
  try {
    urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
    urlStr = urlFilters.filter(urlStr); // filter the url
  }
  catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    urlStr = null;
  }

  // if still a good url then process
  if (urlStr != null) {

    url.set(urlStr);
    try {

      // set the protocol status to success and the crawl status to success
      // create the content from the normalized url and the raw bytes from
      // the arc file,  TODO: currently this doesn't handle text of errors
      // pages (i.e. 404, etc.). We assume we won't get those.
      ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
      Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType,
        new Metadata(), getConf());
      
      // set the url version into the metadata
      content.getMetadata().set(URL_VERSION, version);
      ParseStatus pstatus = null;
      pstatus = output(output, segmentName, url, datum, content, status,
        CrawlDatum.STATUS_FETCH_SUCCESS);
      reporter.progress();
    }
    catch (Throwable t) { // unexpected exception
      logError(url, t);
      output(output, segmentName, url, datum, null, null,
        CrawlDatum.STATUS_FETCH_RETRY);
    }
  }
}
 
Example 20
Source File: InMemoryMapOutput.java    From hadoop with Apache License 2.0
@Override
public void shuffle(MapHost host, InputStream input,
                    long compressedLength, long decompressedLength,
                    ShuffleClientMetrics metrics,
                    Reporter reporter) throws IOException {
  IFileInputStream checksumIn = 
    new IFileInputStream(input, compressedLength, conf);

  input = checksumIn;       

  // Are map-outputs compressed?
  if (codec != null) {
    decompressor.reset();
    input = codec.createInputStream(input, decompressor);
  }

  try {
    IOUtils.readFully(input, memory, 0, memory.length);
    metrics.inputBytes(memory.length);
    reporter.progress();
    LOG.info("Read " + memory.length + " bytes from map-output for " +
              getMapId());

    /**
     * We've gotten the amount of data we were expecting. Verify the
     * decompressor has nothing more to offer. This action also forces the
     * decompressor to read any trailing bytes that weren't critical
     * for decompression, which is necessary to keep the stream
     * in sync.
     */
    if (input.read() >= 0 ) {
      throw new IOException("Unexpected extra bytes from input stream for " +
                             getMapId());
    }

  } catch (IOException ioe) {      
    // Close the streams
    IOUtils.cleanup(LOG, input);

    // Re-throw
    throw ioe;
  } finally {
    CodecPool.returnDecompressor(decompressor);
  }
}