Java Code Examples for org.apache.hadoop.mapreduce.StatusReporter

The following examples show how to use org.apache.hadoop.mapreduce.StatusReporter. These examples are extracted from open source projects; where available, the source project, source file, and license are noted above each example.
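Before the project examples, the sketch below (not taken from any project on this page; the class name and the counter group/name are invented for illustration) shows the core StatusReporter contract the examples rely on: reporting liveness, updating the task status string, and incrementing counters by group and name. In the datawave examples a TaskInputOutputContext is adapted to this interface via ContextWrappedStatusReporter, while the Hadoop test examples use TaskAttemptContextImpl.DummyReporter as a no-op implementation.

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.StatusReporter;

public class StatusReporterSketch {

    // Report progress and bump a counter for a block of processed records.
    // The group/counter names ("DEMO", "RECORDS_PROCESSED") are placeholders.
    public static void report(StatusReporter reporter, long recordsProcessed) {
        reporter.progress();                                   // tell the framework the task is alive
        reporter.setStatus("records processed: " + recordsProcessed);
        Counter counter = reporter.getCounter("DEMO", "RECORDS_PROCESSED");
        counter.increment(1L);
    }
}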
Example 1
protected void buildAllPhrases(ArrayList<Collection<String>> terms, String zone, RawRecordContainer event, int position, BloomFilter alreadyIndexedTerms,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter,
                StatusReporter reporter) throws IOException, InterruptedException {
    if (terms.size() < 2) {
        // An empty list has no tokens/phrases to emit and phrases of length one
        // were already handled
        return;
    }
    StringBuilder sb = new StringBuilder();
    for (String term : terms.get(0)) {
        if (term.length() <= tokenHelper.getTermLengthMinimum()) {
            continue;
        }
        sb.append(term);
        // Need to move the position pointer back by the amount of the phrase lengths
        // accounting for zero-indexing
        completePhrase(sb, terms.subList(1, terms.size()), zone, event, position - (terms.size() - 1), alreadyIndexedTerms, context, contextWriter,
                        reporter);
        
        sb.setLength(0);
    }
}
 
Example 2
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    
    if (event.fatalError()) {
        return null;
    }
    
    this.shardId = getShardId(event);
    this.eventDataTypeName = event.getDataType().outputName();
    this.eventUid = event.getId().toString();
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    // get the typical shard/index information
    values.putAll(super.processBulk(key, event, eventFields, reporter));
    
    flushTokenOffsetCache(event, values);
    
    counters.flush(reporter);
    
    return values;
}
 
Example 3
Source Project: datawave   Source File: ContentIndexCounters.java    License: Apache License 2.0
/**
 * Increments the counter denoted by counterName by one. The counter's value will only be written to the context if it exceeds bufferSize
 * 
 * @param groupName
 *            The name of the counter's group
 * @param counterName
 *            The name of the counter to increment
 * @param reporter
 *            The current task's context
 */
public void increment(String groupName, String counterName, StatusReporter reporter) {
    Map<String,AtomicInteger> group = counts.get(groupName);
    if (group == null) {
        group = new HashMap<>();
        counts.put(groupName, group);
    }
    
    if (group.containsKey(counterName)) {
        AtomicInteger val = group.get(counterName);
        
        if (val.get() > bufferSize && reporter != null) {
            reporter.getCounter(groupName, counterName).increment(val.getAndSet(0));
        }
        
        val.incrementAndGet();
    } else {
        group.put(counterName, new AtomicInteger(1));
    }
}
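A hypothetical usage of the buffered counter helper above, assuming a ContentIndexCounters instance and a StatusReporter are already in scope (construction details omitted; the group and counter names are placeholders): counts accumulate locally and are only pushed to the task's Hadoop counters once the buffer threshold is exceeded or flush(reporter) is called (see Example 12).

// Hypothetical usage sketch; 'counters', 'terms', and 'reporter' are assumed to exist.
for (String term : terms) {
    counters.increment("TOKENIZER", "TERMS_PROCESSED", reporter); // buffered in memory
}
counters.flush(reporter); // push any remaining buffered counts to the task's counters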
 
Example 4
Source Project: datawave   Source File: ContentIndexCounters.java    License: Apache License 2.0
/**
 * Increments the counter denoted by counterName by the given value. The counter's value will only be written to the context if it exceeds bufferSize
 * 
 * @param groupName
 *            The name of the counter's group
 * @param counterName
 *            The name of the counter to increment
 * @param value
 *            The amount to increment the counter by
 * @param reporter
 *            The current task's context
 */
public void incrementValue(String groupName, String counterName, int value, StatusReporter reporter) {
    Map<String,AtomicInteger> group = counts.get(groupName);
    if (group == null) {
        group = new HashMap<>();
        counts.put(groupName, group);
    }
    
    if (group.containsKey(counterName)) {
        AtomicInteger val = group.get(counterName);
        
        if (val.get() > bufferSize && reporter != null) {
            reporter.getCounter(groupName, counterName).increment(val.getAndSet(0));
        }
        
        val.addAndGet(value);
    } else {
        group.put(counterName, new AtomicInteger(value));
    }
}
 
Example 5
Source Project: datawave   Source File: ShardedDataTypeHandler.java    License: Apache License 2.0
/**
 * Creates entries for the shard, shardIndex, and shardReverseIndex tables. This method calls the getFieldValues() method to retrieve the field names and
 * values for the shard table, the getGlobalIndexTerms() method to retrieve the field names and values for the shardIndex table, and the
 * getGlobalReverseIndexTerms() method to retrieve the field names and values for the shardReverseIndex table. The benefit to this approach is that
 * subclasses may only have to parse the event object once to calculate all of this information. This method returns null if the Event object's fatalError()
 * method returns true; otherwise, it returns a Multimap of BulkIngestKey to Value pairs.
 */
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    if (event.fatalError()) {
        return null;
    } else {
        if (isReindexEnabled) {
            Multimap<String,NormalizedContentInterface> filteredEventFields = filterByRequestedFields(eventFields);
            if (filteredEventFields.isEmpty()) {
                return HashMultimap.create(); // nothing to do (none of the reindex fields were found)
            }
            eventFields = filteredEventFields;
        }
        
        Multimap<String,NormalizedContentInterface> fields = getShardNamesAndValues(event, eventFields, (null != getShardIndexTableName()),
                        (null != getShardReverseIndexTableName()), reporter);
        
        return createColumns(event, fields, reporter);
    }
}
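A hypothetical caller of this method, sketched under the assumption that the handler, key, event, eventFields, reporter, contextWriter, and context variables are already in scope; handleFatalEvent is an invented placeholder for whatever error routing the caller performs.

// Caller sketch (variable names assumed, not from the project).
Multimap<BulkIngestKey,Value> entries = handler.processBulk(key, event, eventFields, reporter);
if (entries == null) {
    // fatalError() was true for this event; do not ingest it
    handleFatalEvent(event); // placeholder for caller-specific error handling
} else {
    contextWriter.write(entries, context);
}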
 
Example 6
Source Project: datawave   Source File: ShardedDataTypeHandler.java    License: Apache License 2.0
protected Multimap<BulkIngestKey,Value> createStats(IngestHelperInterface helper, RawRecordContainer event,
                Multimap<String,NormalizedContentInterface> fields, NormalizedContentInterface value, byte[] visibility, byte[] maskedVisibility,
                MaskedFieldHelper maskedFieldHelper, byte[] shardId, Value indexValue, StatusReporter reporter) {
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    // produce cardinality of terms
    values.putAll(createTermIndexColumn(event, value.getIndexedFieldName(), value.getIndexedFieldValue(), visibility, maskedVisibility, maskedFieldHelper,
                    shardId, this.getIndexStatsTableName(), indexValue));
    
    String reverse = new StringBuilder(value.getIndexedFieldValue()).reverse().toString();
    
    values.putAll(createTermIndexColumn(event, value.getIndexedFieldName(), reverse, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    this.getIndexStatsTableName(), indexValue));
    
    return values;
}
 
Example 7
Source Project: datawave   Source File: ShardedDataTypeHandler.java    License: Apache License 2.0
/**
 * @param helper
 * @param event
 * @param fields
 * @param value
 * @param visibility
 * @param maskedVisibility
 * @param maskedFieldHelper
 * @param shardId
 * @param indexValue
 * @param reporter
 */
protected Multimap<BulkIngestKey,Value> createForwardIndices(IngestHelperInterface helper, RawRecordContainer event,
                Multimap<String,NormalizedContentInterface> fields, NormalizedContentInterface value, byte[] visibility, byte[] maskedVisibility,
                MaskedFieldHelper maskedFieldHelper, byte[] shardId, Value indexValue, StatusReporter reporter) {
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    String fieldName = value.getIndexedFieldName();
    String fieldValue = value.getIndexedFieldValue();
    // produce field index.
    values.putAll(createShardFieldIndexColumn(event, fieldName, fieldValue, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    createBloomFilter(event, fields, reporter)));
    
    // produce index column
    values.putAll(createTermIndexColumn(event, fieldName, fieldValue, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    this.getShardIndexTableName(), indexValue));
    
    return values;
}
 
Example 8
Source Project: datawave   Source File: ContentFunctionQueryTest.java    License: Apache License 2.0
public Multimap<BulkIngestKey,Value> processContent(final RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    
    BufferedReader content = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(event.getRawData())));
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    // Process test file
    // CSV file => position,skips,term,score\n
    content.lines().forEach(
                    line -> {
                        String[] parts = line.split(",");
                        TermWeight.Info info = TermWeight.Info.newBuilder().addTermOffset(Integer.parseInt(parts[0]))
                                        .addPrevSkips(Integer.parseInt(parts[1]))
                                        .addScore(TermWeightPosition.positionScoreToTermWeightScore(Float.parseFloat(parts[3])))
                                        .setZeroOffsetMatch(true).build();
                        
                        NormalizedFieldAndValue nfv = new NormalizedFieldAndValue("BODY", parts[2]);
                        getShardFIKey(nfv, event, values);
                        getShardIndexFIKey(nfv, event, values);
                        getTFKey(nfv, event, values, info);
                        
                    });
    
    return values;
}
 
Example 9
Source Project: datawave   Source File: WikipediaDataTypeHandler.java    License: Apache License 2.0
@Override
public long process(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter)
                throws IOException, InterruptedException {
    
    // Hold some event-specific variables to avoid re-processing
    this.shardId = getShardId(event);
    
    this.eventDataTypeName = event.getDataType().outputName();
    this.eventUid = event.getId().toString();
    
    // write the standard set of keys
    Multimap<BulkIngestKey,Value> keys = super.processBulk(key, event, eventFields, new ContextWrappedStatusReporter(context));
    long count = keys.size();
    contextWriter.write(keys, context);
    
    // gc before we get into the tokenization piece
    keys = null;
    
    // stream the tokens to the context writer here
    StatusReporter reporter = new ContextWrappedStatusReporter(context);
    count += tokenizeEvent(event, context, contextWriter, reporter);
    
    // return the number of records written
    return count;
}
 
Example 10
@Override
public long process(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter)
                throws IOException, InterruptedException {
    
    // Hold some event-specific variables to avoid re-processing
    this.shardId = getShardId(event);
    
    if (tokenHelper.isVerboseShardCounters()) {
        context.getCounter("EVENT_SHARD_ID", new String(this.shardId)).increment(1);
    }
    
    this.eventDataTypeName = event.getDataType().outputName();
    this.eventUid = event.getId().toString();
    
    // write the standard set of keys
    Multimap<BulkIngestKey,Value> keys = super.processBulk(key, event, eventFields, new ContextWrappedStatusReporter(context));
    long count = keys.size();
    contextWriter.write(keys, context);
    
    StatusReporter reporter = new ContextWrappedStatusReporter(context);
    
    // gc before we get into the tokenization piece
    keys = null;
    
    // stream the tokens to the context writer here
    count += tokenizeEvent(event, context, contextWriter, reporter);
    
    // return the number of records written
    return count;
}
 
Example 11
private void completePhrase(StringBuilder baseTerm, List<Collection<String>> terms, String zone, RawRecordContainer event, int position,
                BloomFilter alreadyIndexedTerms, TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context,
                ContextWriter<KEYOUT,VALUEOUT> contextWriter, StatusReporter reporter) throws IOException, InterruptedException {
    if (terms.isEmpty()) {
        return;
    }
    for (String term : terms.get(0)) {
        if (term == null) {
            continue;
        }
        boolean properLen = term.length() >= tokenHelper.getTermLengthMinimum();
        // Add the current term and emit the phrase if the current term isn't empty
        if (properLen) {
            baseTerm.append(SPACE).append(term);
            
            counters.increment(ContentIndexCounters.PHRASES_PROCESSED_COUNTER, reporter);
            
            processTermAndZone(event, position, new TermAndZone(baseTerm.toString(), zone), alreadyIndexedTerms, context, contextWriter, reporter);
        }
        
        // If we have more terms to add to this phrase, recurse
        if (terms.size() > 1) {
            completePhrase(baseTerm, terms.subList(1, terms.size()), zone, event, position, alreadyIndexedTerms, context, contextWriter, reporter);
        }
        
        // Only remove the space and term if we actually added one
        if (properLen) {
            // Remove the space and the token we appended last
            baseTerm.setLength(baseTerm.length() - 1 - term.length());
        }
    }
}
 
Example 12
Source Project: datawave   Source File: ContentIndexCounters.java    License: Apache License 2.0
/**
 * Flushes all counter values to the context and sets the values to zero.
 * 
 * @param reporter
 */
public void flush(StatusReporter reporter) {
    if (reporter != null) {
        for (Entry<String,Map<String,AtomicInteger>> countEntry : counts.entrySet()) {
            String groupName = countEntry.getKey();
            for (Entry<String,AtomicInteger> groupEntry : countEntry.getValue().entrySet()) {
                if (groupEntry.getValue().get() > 0) {
                    reporter.getCounter(groupName, groupEntry.getKey()).increment(groupEntry.getValue().getAndSet(0));
                }
            }
        }
    }
}
 
Example 13
Source Project: datawave   Source File: ShardedDataTypeHandler.java    License: Apache License 2.0
protected Multimap<BulkIngestKey,Value> createReverseIndices(IngestHelperInterface helper, RawRecordContainer event,
                Multimap<String,NormalizedContentInterface> fields, NormalizedContentInterface value, byte[] visibility, byte[] maskedVisibility,
                MaskedFieldHelper maskedFieldHelper, byte[] shardId, Value indexValue, StatusReporter reporter) {
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    String fieldName = value.getIndexedFieldName();
    String fieldValue = value.getIndexedFieldValue();
    // produce index column
    values.putAll(createTermIndexColumn(event, fieldName, fieldValue, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    this.getShardReverseIndexTableName(), indexValue));
    
    return values;
}
 
Example 14
Source Project: datawave   Source File: SimpleDataTypeHandler.java    License: Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(IK key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> fields,
                StatusReporter reporter) {
    String date = df.format(new Date());
    Text table = new Text(TABLE);
    Value value = new Value("1".getBytes());
    
    Multimap<BulkIngestKey,Value> pairs = HashMultimap.create();
    for (Map.Entry<String,NormalizedContentInterface> entry : fields.entries()) {
        BulkIngestKey bik = new BulkIngestKey(table, new Key(date, entry.getKey(), entry.getValue().getEventFieldValue()));
        pairs.put(bik, value);
    }
    
    return pairs;
}
 
Example 15
Source Project: datawave   Source File: WikipediaDataTypeHandler.java    License: Apache License 2.0
/**
 * Tokenize the event, and write all of the shard, shardIndex, and shardReverseIndex keys out to the context
 * 
 * @param event
 * @param context
 * @param contextWriter
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected long tokenizeEvent(RawRecordContainer event, TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context,
                ContextWriter<KEYOUT,VALUEOUT> contextWriter, StatusReporter reporter) throws IOException, InterruptedException {
    
    long count = 0;
    
    final byte[] visibility = flatten(event.getVisibility());
    final byte[] rawData = event.getRawData();
    
    Document root;
    try {
        root = this.parser.parse(new ByteArrayInputStream(rawData));
    } catch (SAXException e) {
        throw new RuntimeException(e);
    }
    
    NodeList revisions = root.getElementsByTagName("revision");
    
    // For each revision, try to find the stuff we want to tokenize
    for (int i = 0; i < revisions.getLength(); i++) {
        Node revision = revisions.item(i);
        NodeList children = revision.getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
            Node revChild = children.item(j);
            
            if (REVISION_COMMENT.equals(revChild.getNodeName())) {
                count += tokenizeTextNode(revChild.getTextContent(), event, visibility, context, contextWriter, REVISION_COMMENT_FIELD_NAME,
                                REVISION_COMMENT_TOKEN, reporter);
            } else if (REVISION_TEXT.equals(revChild.getNodeName())) {
                count += tokenizeTextNode(revChild.getTextContent(), event, visibility, context, contextWriter, REVISION_TEXT_FIELD_NAME,
                                REVISION_TEXT_TOKEN, reporter);
            }
        }
    }
    
    return count;
}
 
Example 16
Source Project: hadoop   Source File: MapContextImpl.java    License: Apache License 2.0
public MapContextImpl(Configuration conf, TaskAttemptID taskid,
                      RecordReader<KEYIN,VALUEIN> reader,
                      RecordWriter<KEYOUT,VALUEOUT> writer,
                      OutputCommitter committer,
                      StatusReporter reporter,
                      InputSplit split) {
  super(conf, taskid, writer, committer, reporter);
  this.reader = reader;
  this.split = split;
}
 
Example 17
Source Project: hadoop   Source File: ReduceContextImpl.java    License: Apache License 2.0
public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
                         RawKeyValueIterator input, 
                         Counter inputKeyCounter,
                         Counter inputValueCounter,
                         RecordWriter<KEYOUT,VALUEOUT> output,
                         OutputCommitter committer,
                         StatusReporter reporter,
                         RawComparator<KEYIN> comparator,
                         Class<KEYIN> keyClass,
                         Class<VALUEIN> valueClass
                        ) throws InterruptedException, IOException{
  super(conf, taskid, output, committer, reporter);
  this.input = input;
  this.inputKeyCounter = inputKeyCounter;
  this.inputValueCounter = inputValueCounter;
  this.comparator = comparator;
  this.serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(buffer);
  this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
  this.valueDeserializer.open(buffer);
  hasMore = input.next();
  this.keyClass = keyClass;
  this.valueClass = valueClass;
  this.conf = conf;
  this.taskid = taskid;
}
 
Example 18
Source Project: hadoop   Source File: TaskInputOutputContextImpl.java    License: Apache License 2.0
public TaskInputOutputContextImpl(Configuration conf, TaskAttemptID taskid,
                                  RecordWriter<KEYOUT,VALUEOUT> output,
                                  OutputCommitter committer,
                                  StatusReporter reporter) {
  super(conf, taskid, reporter);
  this.output = output;
  this.committer = committer;
}
 
Example 19
Source Project: hadoop   Source File: TestGridMixClasses.java    License: Apache License 2.0
@SuppressWarnings({"rawtypes", "unchecked"})
@Test (timeout=10000)
public void testLoadMapper() throws Exception {

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

  TaskAttemptID taskId = new TaskAttemptID();
  RecordReader<NullWritable, GridmixRecord> reader = new FakeRecordReader();

  LoadRecordGkGrWriter writer = new LoadRecordGkGrWriter();

  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  LoadSplit split = getLoadSplit();

  MapContext<NullWritable, GridmixRecord, GridmixKey, GridmixRecord> mapContext = new MapContextImpl<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>(
          conf, taskId, reader, writer, committer, reporter, split);
  // context
  Context ctx = new WrappedMapper<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>()
          .getMapContext(mapContext);

  reader.initialize(split, ctx);
  ctx.getConfiguration().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  CompressionEmulationUtil.setCompressionEmulationEnabled(
          ctx.getConfiguration(), true);

  LoadJob.LoadMapper mapper = new LoadJob.LoadMapper();
  // setup, map, clean
  mapper.run(ctx);

  Map<GridmixKey, GridmixRecord> data = writer.getData();
  // check result
  assertEquals(2, data.size());

}
 
Example 20
Source Project: hadoop   Source File: TestGridMixClasses.java    License: Apache License 2.0
@SuppressWarnings({"unchecked", "rawtypes"})
@Test (timeout=30000)
public void testSleepMapper() throws Exception {
  SleepJob.SleepMapper test = new SleepJob.SleepMapper();

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  TaskAttemptID taskId = new TaskAttemptID();
  FakeRecordLLReader reader = new FakeRecordLLReader();
  LoadRecordGkNullWriter writer = new LoadRecordGkNullWriter();
  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  SleepSplit split = getSleepSplit();
  MapContext<LongWritable, LongWritable, GridmixKey, NullWritable> mapcontext = new MapContextImpl<LongWritable, LongWritable, GridmixKey, NullWritable>(
          conf, taskId, reader, writer, committer, reporter, split);
  Context context = new WrappedMapper<LongWritable, LongWritable, GridmixKey, NullWritable>()
          .getMapContext(mapcontext);

  long start = System.currentTimeMillis();
  LOG.info("start:" + start);
  LongWritable key = new LongWritable(start + 2000);
  LongWritable value = new LongWritable(start + 2000);
  // should sleep 2 sec
  test.map(key, value, context);
  LOG.info("finish:" + System.currentTimeMillis());
  assertTrue(System.currentTimeMillis() >= (start + 2000));

  test.cleanup(context);
  assertEquals(1, writer.getData().size());
}
 
Example 21
Source Project: big-c   Source File: MapContextImpl.java    License: Apache License 2.0
public MapContextImpl(Configuration conf, TaskAttemptID taskid,
                      RecordReader<KEYIN,VALUEIN> reader,
                      RecordWriter<KEYOUT,VALUEOUT> writer,
                      OutputCommitter committer,
                      StatusReporter reporter,
                      InputSplit split) {
  super(conf, taskid, writer, committer, reporter);
  this.reader = reader;
  this.split = split;
}
 
Example 22
Source Project: big-c   Source File: ReduceContextImpl.java    License: Apache License 2.0
public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
                         RawKeyValueIterator input, 
                         Counter inputKeyCounter,
                         Counter inputValueCounter,
                         RecordWriter<KEYOUT,VALUEOUT> output,
                         OutputCommitter committer,
                         StatusReporter reporter,
                         RawComparator<KEYIN> comparator,
                         Class<KEYIN> keyClass,
                         Class<VALUEIN> valueClass
                        ) throws InterruptedException, IOException{
  super(conf, taskid, output, committer, reporter);
  this.input = input;
  this.inputKeyCounter = inputKeyCounter;
  this.inputValueCounter = inputValueCounter;
  this.comparator = comparator;
  this.serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(buffer);
  this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
  this.valueDeserializer.open(buffer);
  hasMore = input.next();
  this.keyClass = keyClass;
  this.valueClass = valueClass;
  this.conf = conf;
  this.taskid = taskid;
}
 
Example 23
Source Project: big-c   Source File: TaskInputOutputContextImpl.java    License: Apache License 2.0
public TaskInputOutputContextImpl(Configuration conf, TaskAttemptID taskid,
                                  RecordWriter<KEYOUT,VALUEOUT> output,
                                  OutputCommitter committer,
                                  StatusReporter reporter) {
  super(conf, taskid, reporter);
  this.output = output;
  this.committer = committer;
}
 
Example 24
Source Project: big-c   Source File: TestGridMixClasses.java    License: Apache License 2.0
@SuppressWarnings({"rawtypes", "unchecked"})
@Test (timeout=10000)
public void testLoadMapper() throws Exception {

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

  TaskAttemptID taskId = new TaskAttemptID();
  RecordReader<NullWritable, GridmixRecord> reader = new FakeRecordReader();

  LoadRecordGkGrWriter writer = new LoadRecordGkGrWriter();

  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  LoadSplit split = getLoadSplit();

  MapContext<NullWritable, GridmixRecord, GridmixKey, GridmixRecord> mapContext = new MapContextImpl<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>(
          conf, taskId, reader, writer, committer, reporter, split);
  // context
  Context ctx = new WrappedMapper<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>()
          .getMapContext(mapContext);

  reader.initialize(split, ctx);
  ctx.getConfiguration().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  CompressionEmulationUtil.setCompressionEmulationEnabled(
          ctx.getConfiguration(), true);

  LoadJob.LoadMapper mapper = new LoadJob.LoadMapper();
  // setup, map, clean
  mapper.run(ctx);

  Map<GridmixKey, GridmixRecord> data = writer.getData();
  // check result
  assertEquals(2, data.size());

}
 
Example 25
Source Project: big-c   Source File: TestGridMixClasses.java    License: Apache License 2.0
@SuppressWarnings({"unchecked", "rawtypes"})
@Test (timeout=30000)
public void testSleepMapper() throws Exception {
  SleepJob.SleepMapper test = new SleepJob.SleepMapper();

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  TaskAttemptID taskId = new TaskAttemptID();
  FakeRecordLLReader reader = new FakeRecordLLReader();
  LoadRecordGkNullWriter writer = new LoadRecordGkNullWriter();
  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  SleepSplit split = getSleepSplit();
  MapContext<LongWritable, LongWritable, GridmixKey, NullWritable> mapcontext = new MapContextImpl<LongWritable, LongWritable, GridmixKey, NullWritable>(
          conf, taskId, reader, writer, committer, reporter, split);
  Context context = new WrappedMapper<LongWritable, LongWritable, GridmixKey, NullWritable>()
          .getMapContext(mapcontext);

  long start = System.currentTimeMillis();
  LOG.info("start:" + start);
  LongWritable key = new LongWritable(start + 2000);
  LongWritable value = new LongWritable(start + 2000);
  // should sleep 2 sec
  test.map(key, value, context);
  LOG.info("finish:" + System.currentTimeMillis());
  assertTrue(System.currentTimeMillis() >= (start + 2000));

  test.cleanup(context);
  assertEquals(1, writer.getData().size());
}
 
Example 26
Source Project: stratio-cassandra   Source File: HadoopCompat.java    License: Apache License 2.0
/**
 * Instantiates MapContext under Hadoop 1 and MapContextImpl under Hadoop 2.
 */
public static MapContext newMapContext(Configuration conf,
                                       TaskAttemptID taskAttemptID,
                                       RecordReader recordReader,
                                       RecordWriter recordWriter,
                                       OutputCommitter outputCommitter,
                                       StatusReporter statusReporter,
                                       InputSplit inputSplit) {
    return (MapContext) newInstance(MAP_CONTEXT_CONSTRUCTOR,
            conf, taskAttemptID, recordReader, recordWriter, outputCommitter,
            statusReporter, inputSplit);
}
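A sketch of how the compatibility shim above might be called; the reporter is the no-op DummyReporter used in the Hadoop test examples earlier, and the remaining arguments are assumed to be constructed elsewhere.

// Sketch only: conf, taskAttemptID, recordReader, recordWriter, outputCommitter and inputSplit
// are assumed to exist in scope; DummyReporter is a no-op StatusReporter.
StatusReporter statusReporter = new TaskAttemptContextImpl.DummyReporter();
MapContext mapContext = HadoopCompat.newMapContext(conf, taskAttemptID, recordReader, recordWriter,
                outputCommitter, statusReporter, inputSplit);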
 
Example 27
Source Project: datawave   Source File: ErrorShardedDataTypeHandler.java    License: Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    throw new UnsupportedOperationException("processBulk is not supported, please use process");
}
 
Example 28
Source Project: datawave   Source File: ErrorDataTypeHandler.java    License: Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> fields,
                StatusReporter reporter) {
    throw new UnsupportedOperationException("processBulk is not supported, please use process");
}
 
Example 29
Source Project: datawave   Source File: ProtobufEdgeDataTypeHandler.java    License: Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> fields,
                StatusReporter reporter) {
    throw new UnsupportedOperationException("processBulk is not supported, please use process");
}
 
Example 30
/**
 * Process a term and zone by writing all applicable keys to the context.
 * 
 * @param event
 * @param position
 * @param termAndZone
 * @param alreadyIndexedTerms
 * @param context
 * @param contextWriter
 * @param reporter
 * @throws IOException
 * @throws InterruptedException
 */
private void processTermAndZone(RawRecordContainer event, int position, TermAndZone termAndZone, BloomFilter alreadyIndexedTerms,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter,
                StatusReporter reporter) throws IOException, InterruptedException {
    
    // Make sure the term length is greater than the minimum allowed length
    if (termAndZone.term.length() < tokenHelper.getTermLengthMinimum()) {
        log.debug("Ignoring token of length " + termAndZone.term.length() + " because it is too short");
        counters.increment(ContentIndexCounters.TOO_SHORT_COUNTER, reporter);
        return;
    }
    
    // Track all tokens (including synonyms) processed
    counters.increment(ContentIndexCounters.ALL_PROCESSED_COUNTER, reporter);
    
    // Normalize the term since it won't be auto-normalized through the eventFields map
    NormalizedFieldAndValue normFnV = new NormalizedFieldAndValue(termAndZone.zone, termAndZone.term);
    Set<NormalizedContentInterface> ncis = this.ingestHelper.normalize(normFnV);
    // nfv = (NormalizedFieldAndValue) this.ingestHelper.normalize(nfv);
    
    for (NormalizedContentInterface nci : ncis) {
        if (!(nci instanceof NormalizedFieldAndValue)) {
            log.warn("Can't handle a " + nci.getClass() + "; must be a NormalizedFieldAndValue.");
        }
        NormalizedFieldAndValue nfv = (NormalizedFieldAndValue) nci;
        byte[] fieldVisibility = getVisibility(event, nfv);
        
        // Build the event column key/value
        createShardEventColumn(event, contextWriter, context, nfv, this.shardId, fieldVisibility);
        
        // Create a index normalized variant of the term and zone for indexing purposes
        TermAndZone indexedTermAndZone = new TermAndZone(nfv.getIndexedFieldValue(), nfv.getIndexedFieldName());
        
        org.apache.hadoop.util.bloom.Key alreadySeen = null;
        if ((alreadyIndexedTerms != null)
                        && alreadyIndexedTerms.membershipTest(alreadySeen = new org.apache.hadoop.util.bloom.Key(indexedTermAndZone.getToken().getBytes()))) {
            if (log.isDebugEnabled()) {
                log.debug("Not creating index mutations for " + termAndZone + " as we've already created mutations for it.");
            }
            counters.increment(ContentIndexCounters.BLOOM_FILTER_EXISTS, reporter);
        } else if ((tokenOffsetCache != null) && tokenOffsetCache.containsKey(indexedTermAndZone)) {
            if (log.isDebugEnabled()) {
                log.debug("Not creating index mutations for " + termAndZone + " as we've already created mutations for it.");
            }
            counters.increment(ContentIndexCounters.TOKEN_OFFSET_CACHE_EXISTS, reporter);
        } else {
            // create the index
            createShardIndexColumns(event, contextWriter, context, nfv, this.shardId, fieldVisibility);
            
            if (alreadyIndexedTerms != null) {
                alreadyIndexedTerms.add(alreadySeen);
                counters.increment(ContentIndexCounters.BLOOM_FILTER_ADDED, reporter);
            }
        }
        
        // Now add the offset to the token offset queue, and if we overflow then output the overflow
        if (tokenOffsetCache != null) {
            OffsetList<Integer> overflow = tokenOffsetCache.addOffset(indexedTermAndZone, position);
            if (overflow != null) {
                // no need to normalize as that was already done upon insertion into the token offset cache
                NormalizedFieldAndValue overflowNfv = new NormalizedFieldAndValue(overflow.termAndZone.zone, overflow.termAndZone.term);
                byte[] overflowFieldVisibility = getVisibility(event, overflowNfv);
                
                // Build the field index key/value
                createTermFrequencyIndex(event, contextWriter, context, this.shardId, overflowNfv, overflow.offsets, overflowFieldVisibility,
                                this.ingestHelper.getDeleteMode());
                counters.increment(ContentIndexCounters.TOKENIZER_OFFSET_CACHE_OVERFLOWS, reporter);
                counters.incrementValue(ContentIndexCounters.TOKENIZER_OFFSET_CACHE_POSITIONS_OVERFLOWED, overflow.offsets.size(), reporter);
            }
        } else {
            createTermFrequencyIndex(event, contextWriter, context, this.shardId, nfv, Arrays.asList(position), fieldVisibility,
                            this.ingestHelper.getDeleteMode());
        }
    }
}