org.apache.hadoop.mapreduce.StatusReporter Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.StatusReporter. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
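For orientation: StatusReporter is a small abstract class whose implementations provide counter lookup, progress reporting, and status updates, and most of the examples below simply pass one through to lower-level code. The skeleton below is a minimal no-op implementation written for this page (similar in spirit to Hadoop's TaskAttemptContextImpl.DummyReporter used in several of the tests); it is a sketch for reference, not code from any of the projects listed, and the getProgress() override is only present on Hadoop 2.x lines.

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.StatusReporter;

// Minimal no-op StatusReporter: counters are tracked locally, progress and status calls are ignored.
public class NoOpStatusReporter extends StatusReporter {

    private final Counters counters = new Counters();

    @Override
    public Counter getCounter(Enum<?> name) {
        return counters.findCounter(name);
    }

    @Override
    public Counter getCounter(String group, String name) {
        return counters.findCounter(group, name);
    }

    @Override
    public void progress() {
        // no-op: there is no task framework to notify
    }

    @Override
    public float getProgress() {
        return 0f;
    }

    @Override
    public void setStatus(String status) {
        // no-op: the status string is discarded
    }
}
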
Example #1
Source File: WikipediaDataTypeHandler.java    From datawave with Apache License 2.0
@Override
public long process(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter)
                throws IOException, InterruptedException {
    
    // Hold some event-specific variables to avoid re-processing
    this.shardId = getShardId(event);
    
    this.eventDataTypeName = event.getDataType().outputName();
    this.eventUid = event.getId().toString();
    
    // write the standard set of keys
    Multimap<BulkIngestKey,Value> keys = super.processBulk(key, event, eventFields, new ContextWrappedStatusReporter(context));
    long count = keys.size();
    contextWriter.write(keys, context);
    
    // gc before we get into the tokenization piece
    keys = null;
    
    // stream the tokens to the context writer here
    StatusReporter reporter = new ContextWrappedStatusReporter(context);
    count += tokenizeEvent(event, context, contextWriter, reporter);
    
    // return the number of records written
    return count;
}
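
ContextWrappedStatusReporter, used throughout the datawave examples above and below, adapts the live TaskInputOutputContext to the StatusReporter interface so that lower-level ingest code only depends on StatusReporter. Its actual implementation is not reproduced on this page; the sketch below is a hypothetical delegating adapter of the same general shape (the class name and method bodies are assumptions, not datawave code).

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.StatusReporter;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;

// Hypothetical sketch: every StatusReporter call is delegated to the wrapped MapReduce task context.
public class ContextBackedStatusReporter extends StatusReporter {

    private final TaskInputOutputContext<?,?,?,?> context;

    public ContextBackedStatusReporter(TaskInputOutputContext<?,?,?,?> context) {
        this.context = context;
    }

    @Override
    public Counter getCounter(Enum<?> name) {
        return context.getCounter(name);
    }

    @Override
    public Counter getCounter(String group, String name) {
        return context.getCounter(group, name);
    }

    @Override
    public void progress() {
        context.progress();
    }

    @Override
    public float getProgress() {
        return context.getProgress();
    }

    @Override
    public void setStatus(String status) {
        context.setStatus(status);
    }
}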
 
Example #2
Source File: ContentIndexCounters.java    From datawave with Apache License 2.0
/**
 * Increments the counter denoted by counterName by one. The counter's value will only be written to the context if it exceeds bufferSize
 * 
 * @param groupName
 *            The name of the counter's group
 * @param counterName
 *            The name of the counter to increment
 * @param reporter
 *            The current task's context
 */
public void increment(String groupName, String counterName, StatusReporter reporter) {
    Map<String,AtomicInteger> group = counts.get(groupName);
    if (group == null) {
        group = new HashMap<>();
        counts.put(groupName, group);
    }
    
    if (group.containsKey(counterName)) {
        AtomicInteger val = group.get(counterName);
        
        if (val.get() > bufferSize && reporter != null) {
            reporter.getCounter(groupName, counterName).increment(val.getAndSet(0));
        }
        
        val.incrementAndGet();
    } else {
        group.put(counterName, new AtomicInteger(1));
    }
}
 
Example #3
Source File: ContentIndexCounters.java    From datawave with Apache License 2.0
/**
 * Increments the counter denoted by counterName by the given value. The counter's value will only be written to the context if it exceeds bufferSize
 * 
 * @param groupName
 *            The name of the counter's group
 * @param counterName
 *            The name of the counter to increment
 * @param value
 *            The amount to increment the counter by
 * @param reporter
 *            The current task's context
 */
public void incrementValue(String groupName, String counterName, int value, StatusReporter reporter) {
    Map<String,AtomicInteger> group = counts.get(groupName);
    if (group == null) {
        group = new HashMap<>();
        counts.put(groupName, group);
    }
    
    if (group.containsKey(counterName)) {
        AtomicInteger val = group.get(counterName);
        
        if (val.get() > bufferSize && reporter != null) {
            reporter.getCounter(groupName, counterName).increment(val.getAndSet(0));
        }
        
        val.addAndGet(value);
    } else {
        group.put(counterName, new AtomicInteger(value));
    }
}
 
Example #4
Source File: ShardedDataTypeHandler.java    From datawave with Apache License 2.0
/**
 * Creates entries for the shard, shardIndex, and shardReverseIndex tables. This method calls the getFieldValues() method to retrieve the field names and
 * values for the shard table, the getGlobalIndexTerms() method to retrieve the field names and values for the shardIndex table, and the
 * getGlobalReverseIndexTerms() method to retrieve the field names and values for the shardReverseIndex table. The benefit to this approach is that
 * subclasses may only have to parse the event object once to calculate all of this information. This method returns null if the Event object's fatalError()
 * method returns true; otherwise it returns a Multimap of BulkIngestKey to Value pairs.
 */
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    if (event.fatalError()) {
        return null;
    } else {
        if (isReindexEnabled) {
            Multimap<String,NormalizedContentInterface> filteredEventFields = filterByRequestedFields(eventFields);
            if (filteredEventFields.isEmpty()) {
                return HashMultimap.create(); // nothing to do (none of the reindex fields were found)
            }
            eventFields = filteredEventFields;
        }
        
        Multimap<String,NormalizedContentInterface> fields = getShardNamesAndValues(event, eventFields, (null != getShardIndexTableName()),
                        (null != getShardReverseIndexTableName()), reporter);
        
        return createColumns(event, fields, reporter);
    }
}
 
Example #5
Source File: ShardedDataTypeHandler.java    From datawave with Apache License 2.0
protected Multimap<BulkIngestKey,Value> createStats(IngestHelperInterface helper, RawRecordContainer event,
                Multimap<String,NormalizedContentInterface> fields, NormalizedContentInterface value, byte[] visibility, byte[] maskedVisibility,
                MaskedFieldHelper maskedFieldHelper, byte[] shardId, Value indexValue, StatusReporter reporter) {
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    // produce cardinality of terms
    values.putAll(createTermIndexColumn(event, value.getIndexedFieldName(), value.getIndexedFieldValue(), visibility, maskedVisibility, maskedFieldHelper,
                    shardId, this.getIndexStatsTableName(), indexValue));
    
    String reverse = new StringBuilder(value.getIndexedFieldValue()).reverse().toString();
    
    values.putAll(createTermIndexColumn(event, value.getIndexedFieldName(), reverse, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    this.getIndexStatsTableName(), indexValue));
    
    return values;
}
 
Example #6
Source File: ShardedDataTypeHandler.java    From datawave with Apache License 2.0
/**
 * @param helper
 * @param event
 * @param fields
 * @param value
 * @param visibility
 * @param maskedVisibility
 * @param maskedFieldHelper
 * @param shardId
 * @param indexValue
 * @param reporter
 */
protected Multimap<BulkIngestKey,Value> createForwardIndices(IngestHelperInterface helper, RawRecordContainer event,
                Multimap<String,NormalizedContentInterface> fields, NormalizedContentInterface value, byte[] visibility, byte[] maskedVisibility,
                MaskedFieldHelper maskedFieldHelper, byte[] shardId, Value indexValue, StatusReporter reporter) {
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    String fieldName = value.getIndexedFieldName();
    String fieldValue = value.getIndexedFieldValue();
    // produce field index.
    values.putAll(createShardFieldIndexColumn(event, fieldName, fieldValue, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    createBloomFilter(event, fields, reporter)));
    
    // produce index column
    values.putAll(createTermIndexColumn(event, fieldName, fieldValue, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    this.getShardIndexTableName(), indexValue));
    
    return values;
}
 
Example #7
Source File: ContentIndexingColumnBasedHandler.java    From datawave with Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    
    if (event.fatalError()) {
        return null;
    }
    
    this.shardId = getShardId(event);
    this.eventDataTypeName = event.getDataType().outputName();
    this.eventUid = event.getId().toString();
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    // get the typical shard/index information
    values.putAll(super.processBulk(key, event, eventFields, reporter));
    
    flushTokenOffsetCache(event, values);
    
    counters.flush(reporter);
    
    return values;
}
 
Example #8
Source File: ExtendedContentIndexingColumnBasedHandler.java    From datawave with Apache License 2.0
protected void buildAllPhrases(ArrayList<Collection<String>> terms, String zone, RawRecordContainer event, int position, BloomFilter alreadyIndexedTerms,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter,
                StatusReporter reporter) throws IOException, InterruptedException {
    if (terms.size() < 2) {
        // An empty list has no tokens/phrases to emit and phrases of length one
        // were already handled
        return;
    }
    StringBuilder sb = new StringBuilder();
    for (String term : terms.get(0)) {
        if (term.length() <= tokenHelper.getTermLengthMinimum()) {
            continue;
        }
        sb.append(term);
        // Need to move the position pointer back by the amount of the phrase lengths
        // accounting for zero-indexing
        completePhrase(sb, terms.subList(1, terms.size()), zone, event, position - (terms.size() - 1), alreadyIndexedTerms, context, contextWriter,
                        reporter);
        
        sb.setLength(0);
    }
}
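
A quick trace may help with the recursion between this method and completePhrase (shown later in Example #23). The fragment below is a hypothetical illustration with invented variable names; assuming every term exceeds the configured minimum length, it would cause the phrases "new york", "new york city", "new ny", and "new ny city" to be processed, because completePhrase appends one term from each following synonym collection, emits the accumulated phrase at each step, and rewinds the StringBuilder before trying the next synonym.

// Hypothetical illustration of the synonym/phrase expansion (not project code)
ArrayList<Collection<String>> terms = new ArrayList<>();
terms.add(Collections.singletonList("new"));    // one token at this position
terms.add(Arrays.asList("york", "ny"));         // a token plus a synonym
terms.add(Collections.singletonList("city"));   // one token at this position
buildAllPhrases(terms, "BODY", event, position, alreadyIndexedTerms, context, contextWriter, reporter);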
 
Example #9
Source File: ContentFunctionQueryTest.java    From datawave with Apache License 2.0
public Multimap<BulkIngestKey,Value> processContent(final RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                StatusReporter reporter) {
    
    BufferedReader content = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(event.getRawData())));
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    // Process test file
    // CSV file => position,skips,term,score\n
    content.lines().forEach(
                    line -> {
                        String[] parts = line.split(",");
                        TermWeight.Info info = TermWeight.Info.newBuilder().addTermOffset(Integer.parseInt(parts[0]))
                                        .addPrevSkips(Integer.parseInt(parts[1]))
                                        .addScore(TermWeightPosition.positionScoreToTermWeightScore(Float.parseFloat(parts[3])))
                                        .setZeroOffsetMatch(true).build();
                        
                        NormalizedFieldAndValue nfv = new NormalizedFieldAndValue("BODY", parts[2]);
                        getShardFIKey(nfv, event, values);
                        getShardIndexFIKey(nfv, event, values);
                        getTFKey(nfv, event, values, info);
                        
                    });
    
    return values;
}
 
Example #10
Source File: TestGridMixClasses.java    From hadoop with Apache License 2.0
@SuppressWarnings({"rawtypes", "unchecked"})
@Test (timeout=10000)
public void testLoadMapper() throws Exception {

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

  TaskAttemptID taskId = new TaskAttemptID();
  RecordReader<NullWritable, GridmixRecord> reader = new FakeRecordReader();

  LoadRecordGkGrWriter writer = new LoadRecordGkGrWriter();

  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  LoadSplit split = getLoadSplit();

  MapContext<NullWritable, GridmixRecord, GridmixKey, GridmixRecord> mapContext = new MapContextImpl<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>(
          conf, taskId, reader, writer, committer, reporter, split);
  // context
  Context ctx = new WrappedMapper<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>()
          .getMapContext(mapContext);

  reader.initialize(split, ctx);
  ctx.getConfiguration().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  CompressionEmulationUtil.setCompressionEmulationEnabled(
          ctx.getConfiguration(), true);

  LoadJob.LoadMapper mapper = new LoadJob.LoadMapper();
  // setup, map, clean
  mapper.run(ctx);

  Map<GridmixKey, GridmixRecord> data = writer.getData();
  // check result
  assertEquals(2, data.size());

}
 
Example #11
Source File: TaskInputOutputContextImpl.java    From hadoop with Apache License 2.0
public TaskInputOutputContextImpl(Configuration conf, TaskAttemptID taskid,
                                  RecordWriter<KEYOUT,VALUEOUT> output,
                                  OutputCommitter committer,
                                  StatusReporter reporter) {
  super(conf, taskid, reporter);
  this.output = output;
  this.committer = committer;
}
 
Example #12
Source File: ShardedDataTypeHandler.java    From datawave with Apache License 2.0
protected Multimap<BulkIngestKey,Value> createReverseIndices(IngestHelperInterface helper, RawRecordContainer event,
                Multimap<String,NormalizedContentInterface> fields, NormalizedContentInterface value, byte[] visibility, byte[] maskedVisibility,
                MaskedFieldHelper maskedFieldHelper, byte[] shardId, Value indexValue, StatusReporter reporter) {
    
    Multimap<BulkIngestKey,Value> values = HashMultimap.create();
    
    String fieldName = value.getIndexedFieldName();
    String fieldValue = value.getIndexedFieldValue();
    // produce index column
    values.putAll(createTermIndexColumn(event, fieldName, fieldValue, visibility, maskedVisibility, maskedFieldHelper, shardId,
                    this.getShardReverseIndexTableName(), indexValue));
    
    return values;
}
 
Example #13
Source File: ReduceContextImpl.java    From hadoop with Apache License 2.0
public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
                         RawKeyValueIterator input, 
                         Counter inputKeyCounter,
                         Counter inputValueCounter,
                         RecordWriter<KEYOUT,VALUEOUT> output,
                         OutputCommitter committer,
                         StatusReporter reporter,
                         RawComparator<KEYIN> comparator,
                         Class<KEYIN> keyClass,
                         Class<VALUEIN> valueClass
                        ) throws InterruptedException, IOException{
  super(conf, taskid, output, committer, reporter);
  this.input = input;
  this.inputKeyCounter = inputKeyCounter;
  this.inputValueCounter = inputValueCounter;
  this.comparator = comparator;
  this.serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(buffer);
  this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
  this.valueDeserializer.open(buffer);
  hasMore = input.next();
  this.keyClass = keyClass;
  this.valueClass = valueClass;
  this.conf = conf;
  this.taskid = taskid;
}
 
Example #14
Source File: MapContextImpl.java    From hadoop with Apache License 2.0
public MapContextImpl(Configuration conf, TaskAttemptID taskid,
                      RecordReader<KEYIN,VALUEIN> reader,
                      RecordWriter<KEYOUT,VALUEOUT> writer,
                      OutputCommitter committer,
                      StatusReporter reporter,
                      InputSplit split) {
  super(conf, taskid, writer, committer, reporter);
  this.reader = reader;
  this.split = split;
}
 
Example #15
Source File: SimpleDataTypeHandler.java    From datawave with Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(IK key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> fields,
                StatusReporter reporter) {
    String date = df.format(new Date());
    Text table = new Text(TABLE);
    Value value = new Value("1".getBytes());
    
    Multimap<BulkIngestKey,Value> pairs = HashMultimap.create();
    for (Map.Entry<String,NormalizedContentInterface> entry : fields.entries()) {
        BulkIngestKey bik = new BulkIngestKey(table, new Key(date, entry.getKey(), entry.getValue().getEventFieldValue()));
        pairs.put(bik, value);
    }
    
    return pairs;
}
 
Example #16
Source File: ContentIndexCounters.java    From datawave with Apache License 2.0
/**
 * Flushes all counter values to the context and sets the values to zero.
 * 
 * @param reporter
 */
public void flush(StatusReporter reporter) {
    if (reporter != null) {
        for (Entry<String,Map<String,AtomicInteger>> countEntry : counts.entrySet()) {
            String groupName = countEntry.getKey();
            for (Entry<String,AtomicInteger> groupEntry : countEntry.getValue().entrySet()) {
                if (groupEntry.getValue().get() > 0) {
                    reporter.getCounter(groupName, groupEntry.getKey()).increment(groupEntry.getValue().getAndSet(0));
                }
            }
        }
    }
}
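
Taken together with Examples #2 and #3, this shows the buffering pattern ContentIndexCounters implements: increments accumulate in an in-memory map, are pushed to the Hadoop counters through reporter.getCounter(group, name).increment(...) once the buffer threshold is crossed, and flush(reporter) drains whatever remains. The fragment below is a hedged usage sketch; the group and counter names, and the surrounding variables, are illustrative assumptions rather than constants from the project.

// Hypothetical call site inside a handler: buffer fine-grained counts, then flush once per event
StatusReporter reporter = new ContextWrappedStatusReporter(context);
for (NormalizedContentInterface field : eventFields.values()) {
    // buffered: only written through to the task counters after bufferSize is exceeded
    counters.increment("CONTENT_INDEX", "TERMS_PROCESSED", reporter);
    counters.incrementValue("CONTENT_INDEX", "TERM_CHARS_PROCESSED",
                    field.getIndexedFieldValue().length(), reporter);
}
// push any remaining buffered counts to the task's counters
counters.flush(reporter);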
 
Example #17
Source File: WikipediaDataTypeHandler.java    From datawave with Apache License 2.0
/**
 * Tokenize the event, and write all of the shard, shardIndex, and shardReverseIndex keys out to the context
 * 
 * @param event
 * @param context
 * @param contextWriter
 * @return
 * @throws IOException
 * @throws InterruptedException
 */
@Override
protected long tokenizeEvent(RawRecordContainer event, TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context,
                ContextWriter<KEYOUT,VALUEOUT> contextWriter, StatusReporter reporter) throws IOException, InterruptedException {
    
    long count = 0;
    
    final byte[] visibility = flatten(event.getVisibility());
    final byte[] rawData = event.getRawData();
    
    Document root;
    try {
        root = this.parser.parse(new ByteArrayInputStream(rawData));
    } catch (SAXException e) {
        throw new RuntimeException(e);
    }
    
    NodeList revisions = root.getElementsByTagName("revision");
    
    // For each revision, try to find the stuff we want to tokenize
    for (int i = 0; i < revisions.getLength(); i++) {
        Node revision = revisions.item(i);
        NodeList children = revision.getChildNodes();
        for (int j = 0; j < children.getLength(); j++) {
            Node revChild = children.item(j);
            
            if (REVISION_COMMENT.equals(revChild.getNodeName())) {
                count += tokenizeTextNode(revChild.getTextContent(), event, visibility, context, contextWriter, REVISION_COMMENT_FIELD_NAME,
                                REVISION_COMMENT_TOKEN, reporter);
            } else if (REVISION_TEXT.equals(revChild.getNodeName())) {
                count += tokenizeTextNode(revChild.getTextContent(), event, visibility, context, contextWriter, REVISION_TEXT_FIELD_NAME,
                                REVISION_TEXT_TOKEN, reporter);
            }
        }
    }
    
    return count;
}
 
Example #18
Source File: HadoopCompat.java    From stratio-cassandra with Apache License 2.0
/**
 * Instantiates MapContext under Hadoop 1 and MapContextImpl under Hadoop 2.
 */
public static MapContext newMapContext(Configuration conf,
                                       TaskAttemptID taskAttemptID,
                                       RecordReader recordReader,
                                       RecordWriter recordWriter,
                                       OutputCommitter outputCommitter,
                                       StatusReporter statusReporter,
                                       InputSplit inputSplit) {
    return (MapContext) newInstance(MAP_CONTEXT_CONSTRUCTOR,
            conf, taskAttemptID, recordReader, recordWriter, outputCommitter,
            statusReporter, inputSplit);
}
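
Because Hadoop 1 exposes MapContext as a concrete class while Hadoop 2 splits it into an interface plus MapContextImpl, callers go through this reflective factory instead of a constructor. The fragment below is a hedged usage sketch; the variable names are placeholders, and DummyReporter is the no-op test reporter also used in Examples #10 and #22 (any StatusReporter works here).

// Hypothetical call site: build a MapContext without depending on the Hadoop major version at compile time
StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
MapContext mapContext = HadoopCompat.newMapContext(conf, taskAttemptID, recordReader, recordWriter,
        outputCommitter, reporter, inputSplit);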
 
Example #19
Source File: MapContextImpl.java    From big-c with Apache License 2.0
public MapContextImpl(Configuration conf, TaskAttemptID taskid,
                      RecordReader<KEYIN,VALUEIN> reader,
                      RecordWriter<KEYOUT,VALUEOUT> writer,
                      OutputCommitter committer,
                      StatusReporter reporter,
                      InputSplit split) {
  super(conf, taskid, writer, committer, reporter);
  this.reader = reader;
  this.split = split;
}
 
Example #20
Source File: ReduceContextImpl.java    From big-c with Apache License 2.0
public ReduceContextImpl(Configuration conf, TaskAttemptID taskid,
                         RawKeyValueIterator input, 
                         Counter inputKeyCounter,
                         Counter inputValueCounter,
                         RecordWriter<KEYOUT,VALUEOUT> output,
                         OutputCommitter committer,
                         StatusReporter reporter,
                         RawComparator<KEYIN> comparator,
                         Class<KEYIN> keyClass,
                         Class<VALUEIN> valueClass
                        ) throws InterruptedException, IOException{
  super(conf, taskid, output, committer, reporter);
  this.input = input;
  this.inputKeyCounter = inputKeyCounter;
  this.inputValueCounter = inputValueCounter;
  this.comparator = comparator;
  this.serializationFactory = new SerializationFactory(conf);
  this.keyDeserializer = serializationFactory.getDeserializer(keyClass);
  this.keyDeserializer.open(buffer);
  this.valueDeserializer = serializationFactory.getDeserializer(valueClass);
  this.valueDeserializer.open(buffer);
  hasMore = input.next();
  this.keyClass = keyClass;
  this.valueClass = valueClass;
  this.conf = conf;
  this.taskid = taskid;
}
 
Example #21
Source File: TaskInputOutputContextImpl.java    From big-c with Apache License 2.0
public TaskInputOutputContextImpl(Configuration conf, TaskAttemptID taskid,
                                  RecordWriter<KEYOUT,VALUEOUT> output,
                                  OutputCommitter committer,
                                  StatusReporter reporter) {
  super(conf, taskid, reporter);
  this.output = output;
  this.committer = committer;
}
 
Example #22
Source File: TestGridMixClasses.java    From big-c with Apache License 2.0
@SuppressWarnings({"rawtypes", "unchecked"})
@Test (timeout=10000)
public void testLoadMapper() throws Exception {

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

  TaskAttemptID taskId = new TaskAttemptID();
  RecordReader<NullWritable, GridmixRecord> reader = new FakeRecordReader();

  LoadRecordGkGrWriter writer = new LoadRecordGkGrWriter();

  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  LoadSplit split = getLoadSplit();

  MapContext<NullWritable, GridmixRecord, GridmixKey, GridmixRecord> mapContext = new MapContextImpl<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>(
          conf, taskId, reader, writer, committer, reporter, split);
  // context
  Context ctx = new WrappedMapper<NullWritable, GridmixRecord, GridmixKey, GridmixRecord>()
          .getMapContext(mapContext);

  reader.initialize(split, ctx);
  ctx.getConfiguration().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  CompressionEmulationUtil.setCompressionEmulationEnabled(
          ctx.getConfiguration(), true);

  LoadJob.LoadMapper mapper = new LoadJob.LoadMapper();
  // setup, map, clean
  mapper.run(ctx);

  Map<GridmixKey, GridmixRecord> data = writer.getData();
  // check result
  assertEquals(2, data.size());

}
 
Example #23
Source File: ExtendedContentIndexingColumnBasedHandler.java    From datawave with Apache License 2.0
private void completePhrase(StringBuilder baseTerm, List<Collection<String>> terms, String zone, RawRecordContainer event, int position,
                BloomFilter alreadyIndexedTerms, TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context,
                ContextWriter<KEYOUT,VALUEOUT> contextWriter, StatusReporter reporter) throws IOException, InterruptedException {
    if (terms.isEmpty()) {
        return;
    }
    for (String term : terms.get(0)) {
        if (term == null) {
            continue;
        }
        boolean properLen = term.length() >= tokenHelper.getTermLengthMinimum();
        // Add the current term and emit the phrase if the current term meets the minimum token length
        if (properLen) {
            baseTerm.append(SPACE).append(term);
            
            counters.increment(ContentIndexCounters.PHRASES_PROCESSED_COUNTER, reporter);
            
            processTermAndZone(event, position, new TermAndZone(baseTerm.toString(), zone), alreadyIndexedTerms, context, contextWriter, reporter);
        }
        
        // If we have more terms to add to this phrase, recurse
        if (terms.size() > 1) {
            completePhrase(baseTerm, terms.subList(1, terms.size()), zone, event, position, alreadyIndexedTerms, context, contextWriter, reporter);
        }
        
        // Only remove the space and term if we actually added one
        if (properLen) {
            // Remove the space and the token we appended last
            baseTerm.setLength(baseTerm.length() - 1 - term.length());
        }
    }
}
 
Example #24
Source File: TestGridMixClasses.java    From hadoop with Apache License 2.0
@SuppressWarnings({"unchecked", "rawtypes"})
@Test (timeout=30000)
public void testSleepMapper() throws Exception {
  SleepJob.SleepMapper test = new SleepJob.SleepMapper();

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  TaskAttemptID taskId = new TaskAttemptID();
  FakeRecordLLReader reader = new FakeRecordLLReader();
  LoadRecordGkNullWriter writer = new LoadRecordGkNullWriter();
  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  SleepSplit split = getSleepSplit();
  MapContext<LongWritable, LongWritable, GridmixKey, NullWritable> mapcontext = new MapContextImpl<LongWritable, LongWritable, GridmixKey, NullWritable>(
          conf, taskId, reader, writer, committer, reporter, split);
  Context context = new WrappedMapper<LongWritable, LongWritable, GridmixKey, NullWritable>()
          .getMapContext(mapcontext);

  long start = System.currentTimeMillis();
  LOG.info("start:" + start);
  LongWritable key = new LongWritable(start + 2000);
  LongWritable value = new LongWritable(start + 2000);
  // should sleep 2 sec
  test.map(key, value, context);
  LOG.info("finish:" + System.currentTimeMillis());
  assertTrue(System.currentTimeMillis() >= (start + 2000));

  test.cleanup(context);
  assertEquals(1, writer.getData().size());
}
 
Example #25
Source File: TestGridMixClasses.java    From big-c with Apache License 2.0
@SuppressWarnings({"unchecked", "rawtypes"})
@Test (timeout=30000)
public void testSleepMapper() throws Exception {
  SleepJob.SleepMapper test = new SleepJob.SleepMapper();

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);

  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
  TaskAttemptID taskId = new TaskAttemptID();
  FakeRecordLLReader reader = new FakeRecordLLReader();
  LoadRecordGkNullWriter writer = new LoadRecordGkNullWriter();
  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new TaskAttemptContextImpl.DummyReporter();
  SleepSplit split = getSleepSplit();
  MapContext<LongWritable, LongWritable, GridmixKey, NullWritable> mapcontext = new MapContextImpl<LongWritable, LongWritable, GridmixKey, NullWritable>(
          conf, taskId, reader, writer, committer, reporter, split);
  Context context = new WrappedMapper<LongWritable, LongWritable, GridmixKey, NullWritable>()
          .getMapContext(mapcontext);

  long start = System.currentTimeMillis();
  LOG.info("start:" + start);
  LongWritable key = new LongWritable(start + 2000);
  LongWritable value = new LongWritable(start + 2000);
  // should sleep 2 sec
  test.map(key, value, context);
  LOG.info("finish:" + System.currentTimeMillis());
  assertTrue(System.currentTimeMillis() >= (start + 2000));

  test.cleanup(context);
  assertEquals(1, writer.getData().size());
}
 
Example #26
Source File: ExtendedContentIndexingColumnBasedHandler.java    From datawave with Apache License 2.0
@Override
public long process(KEYIN key, RawRecordContainer event, Multimap<String,NormalizedContentInterface> eventFields,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter)
                throws IOException, InterruptedException {
    
    // Hold some event-specific variables to avoid re-processing
    this.shardId = getShardId(event);
    
    if (tokenHelper.isVerboseShardCounters()) {
        context.getCounter("EVENT_SHARD_ID", new String(this.shardId)).increment(1);
    }
    
    this.eventDataTypeName = event.getDataType().outputName();
    this.eventUid = event.getId().toString();
    
    // write the standard set of keys
    Multimap<BulkIngestKey,Value> keys = super.processBulk(key, event, eventFields, new ContextWrappedStatusReporter(context));
    long count = keys.size();
    contextWriter.write(keys, context);
    
    StatusReporter reporter = new ContextWrappedStatusReporter(context);
    
    // gc before we get into the tokenization piece
    keys = null;
    
    // stream the tokens to the context writer here
    count += tokenizeEvent(event, context, contextWriter, reporter);
    
    // return the number of records written
    return count;
}
 
Example #27
Source File: TaskAttemptContextImpl.java    From hadoop with Apache License 2.0
public TaskAttemptContextImpl(Configuration conf, 
    TaskAttemptID taskId, StatusReporter reporter) {
  super(conf, taskId.getJobID());
  this.taskId = taskId;
  this.reporter = reporter;
}
 
Example #28
Source File: CoreSummaryDataTypeHandler.java    From datawave with Apache License 2.0
@Override
public Multimap<BulkIngestKey,Value> processBulk(KEYIN key, RawRecordContainer record, Multimap<String,NormalizedContentInterface> fields,
                StatusReporter reporter) {
    IngestHelperInterface iHelper = this.getHelper(record.getDataType());
    return createEntries(record, fields, getVisibility(), System.currentTimeMillis(), iHelper);
}
 
Example #29
Source File: TestResourceUsageEmulators.java    From big-c with Apache License 2.0
/**
 * Test {@link LoadJob.ResourceUsageMatcherRunner}.
 */
@Test
@SuppressWarnings("unchecked")
public void testResourceUsageMatcherRunner() throws Exception {
  Configuration conf = new Configuration();
  FakeProgressive progress = new FakeProgressive();
  
  // set the resource calculator plugin
  conf.setClass(TTConfig.TT_RESOURCE_CALCULATOR_PLUGIN,
                DummyResourceCalculatorPlugin.class, 
                ResourceCalculatorPlugin.class);
  // set the resources
  // set the resource implementation class
  conf.setClass(ResourceUsageMatcher.RESOURCE_USAGE_EMULATION_PLUGINS, 
                TestResourceUsageEmulatorPlugin.class, 
                ResourceUsageEmulatorPlugin.class);
  
  long currentTime = System.currentTimeMillis();
  
  // initialize the matcher class
  TaskAttemptID id = new TaskAttemptID("test", 1, TaskType.MAP, 1, 1);
  StatusReporter reporter = new DummyReporter(progress);
  TaskInputOutputContext context = 
    new MapContextImpl(conf, id, null, null, null, reporter, null);
  FakeResourceUsageMatcherRunner matcher = 
    new FakeResourceUsageMatcherRunner(context, null);
  
  // check if the matcher initialized the plugin
  String identifier = TestResourceUsageEmulatorPlugin.DEFAULT_IDENTIFIER;
  long initTime = 
    TestResourceUsageEmulatorPlugin.testInitialization(identifier, conf);
  assertTrue("ResourceUsageMatcherRunner failed to initialize the"
             + " configured plugin", initTime > currentTime);
  
  // check the progress
  assertEquals("Progress mismatch in ResourceUsageMatcherRunner", 
               0, progress.getProgress(), 0D);
  
  // call match() and check progress
  progress.setProgress(0.01f);
  currentTime = System.currentTimeMillis();
  matcher.test();
  long emulateTime = 
    TestResourceUsageEmulatorPlugin.testEmulation(identifier, conf);
  assertTrue("ProgressBasedResourceUsageMatcher failed to load and emulate"
             + " the configured plugin", emulateTime > currentTime);
}
 
Example #30
Source File: WikipediaDataTypeHandler.java    From datawave with Apache License 2.0
/**
 * Process a term and zone by writing all applicable keys to the context.
 * 
 * @param event
 * @param position
 * @param term
 * @param alreadyIndexedTerms
 * @param context
 * @param contextWriter
 * @param fieldName
 * @param fieldNameToken
 * @param reporter
 * @throws IOException
 * @throws InterruptedException
 */
protected void processTerm(RawRecordContainer event, int position, String term, BloomFilter alreadyIndexedTerms,
                TaskInputOutputContext<KEYIN,? extends RawRecordContainer,KEYOUT,VALUEOUT> context, ContextWriter<KEYOUT,VALUEOUT> contextWriter,
                String fieldName, String fieldNameToken, StatusReporter reporter) throws IOException, InterruptedException {
    
    // Track all tokens (including synonyms) processed
    if (context != null) {
        counters.increment(ContentIndexCounters.ALL_PROCESSED_COUNTER, reporter);
    }
    
    // Normalize the term since it won't be auto-normalized through the eventFields map
    NormalizedFieldAndValue normFnV = new NormalizedFieldAndValue(fieldNameToken, term);
    Set<NormalizedContentInterface> ncis = this.ingestHelper.normalize(normFnV);
    
    for (NormalizedContentInterface nci : ncis) {
        if (!(nci instanceof NormalizedFieldAndValue)) {
            log.warn("Can't handle a " + nci.getClass() + "; must be a NormalizedFieldAndValue.");
        }
        NormalizedFieldAndValue nfv = (NormalizedFieldAndValue) nci;
        byte[] fieldVisibility = getVisibility(event, nfv);
        
        // Build the event column key/value
        createShardEventColumn(event, contextWriter, context, nfv, this.shardId, fieldVisibility);
        
        // Create an index-normalized variant of the term and zone for indexing purposes
        TermAndZone indexedTermAndZone = new TermAndZone(nfv.getIndexedFieldValue(), nfv.getIndexedFieldName());
        
        if ((tokenOffsetCache != null) && tokenOffsetCache.containsKey(indexedTermAndZone)) {
            if (log.isDebugEnabled()) {
                log.debug("Not creating index mutations for " + term + " as we've already created mutations for it.");
            }
            counters.increment(ContentIndexCounters.TOKEN_OFFSET_CACHE_EXISTS, reporter);
        } else {
            // create the index
            createShardIndexColumns(event, contextWriter, context, nfv, this.shardId, fieldVisibility);
        }
        
        // Now add the offset to the token offset queue, and if we overflow then output the overflow
        if (tokenOffsetCache != null) {
            OffsetList overflow = tokenOffsetCache.addOffset(indexedTermAndZone, position);
            if (overflow != null) {
                // no need to normalize as that was already done upon insertion into the token offset cache
                NormalizedFieldAndValue overflowNfv = new NormalizedFieldAndValue(overflow.termAndZone.zone, overflow.termAndZone.term);
                byte[] overflowFieldVisibility = getVisibility(event, overflowNfv);
                
                // Build the field index key/value
                createTermFrequencyIndex(event, contextWriter, context, this.shardId, overflowNfv, overflow.offsets, overflowFieldVisibility, false);
                counters.increment(ContentIndexCounters.TOKENIZER_OFFSET_CACHE_OVERFLOWS, reporter);
                counters.incrementValue(ContentIndexCounters.TOKENIZER_OFFSET_CACHE_POSITIONS_OVERFLOWED, overflow.offsets.size(), reporter);
            }
        } else {
            createTermFrequencyIndex(event, contextWriter, context, this.shardId, nfv, Arrays.asList(position), fieldVisibility, false);
        }
    }
}