Java Code Examples for org.apache.hadoop.io.Text#find()

The following examples show how to use org.apache.hadoop.io.Text#find(). You can go to the original project or source file by following the links above each example.
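
Text#find() has two overloads, find(String what) and find(String what, int start). Both return the byte offset at which the UTF-8 encoding of the search string begins within the Text, or -1 if it is absent; the result is a byte position, not a character index. A minimal standalone sketch (not drawn from the projects below) illustrates the contract:

Text t = new Text("shard_2020_01");
t.find("_");     // 5  -- byte offset of the first occurrence
t.find("_", 6);  // 10 -- the search begins at byte offset 6
t.find("@");     // -1 -- not found
// Offsets are bytes, so multi-byte characters shift them:
new Text("é_x").find("_");   // 2, because 'é' occupies two bytes in UTF-8
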
Example 1
Source File: ShardedTableTabletBalancer.java    From datawave with Apache License 2.0
private byte[] retrieveDate(KeyExtent extent) {
    Text endRow = extent.getEndRow();
    if (endRow == null)
        endRow = extent.getPrevEndRow();
    if (endRow == null) {
        log.warn("Attempting to retrieve date from empty extent " + extent + ". Is your sharded table pre-split?");
        return "null".getBytes();
    } else {
        int idx = endRow.find("_");
        if (idx <= 0) {
            idx = endRow.getLength();
            log.warn("Extent " + extent + " does not conform to sharded date scheme yyyyMMdd_num");
        }
        return Arrays.copyOf(endRow.getBytes(), idx);
    }
}
 
Example 2
Source File: AncestorIndexBuildingVisitor.java    From datawave with Apache License 2.0
/**
 * Extract the uid from an event key with the format: shardId dataType\0UID FieldName\0FieldValue NULL
 * 
 * @param key
 *            the event key to parse
 * @return the uid from the key's column family, or null if it contains no null-byte separator
 */
private String getUid(Key key) {
    Text startColfam = key.getColumnFamily();
    if (startColfam.find(Constants.NULL) != -1) {
        // the column family contains a document uid following the null byte;
        // parse out the uid
        String cf = startColfam.toString();
        int index = cf.indexOf('\0');
        if (index >= 0) {
            
            return cf.substring(index + 1);
        }
    }
    
    return null;
}
 
Example 3
Source File: Phase3Step2DistinctDataJobTest.java    From dkpro-c4corpus with Apache License 2.0
@Test
public void testSplit()
        throws Exception
{
    Text key = new Text("123_456789");

    // hard-split using array copy; note that getBytes() returns the backing
    // array, which may be longer than the encoded content, so size the copy
    // with getLength() rather than bytes.length
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, key.getLength() - i - 1);

    String fileName = new String(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}
 
Example 4
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
private void setFieldPositionsAndLengths(Text line)
{
	int pos = 0; // the byte position within the record
	int fieldno = 0; // the field index within the record
	while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field
	{
		int endpos = line.find(Delim, pos); // the field's end position
		if (endpos < 0)
			endpos = line.getLength();

		fieldPositions[fieldno] = pos;
		fieldLengths[fieldno] = endpos - pos;

		pos = endpos + 1; // the next starting position is the current end + 1
		fieldno += 1;
	}

	if (fieldno != NUM_QSEQ_COLS)
		throw new FormatException("found " + fieldno + " fields instead of 11 at " +
		            makePositionMessage(this.pos - line.getLength()) + ". Line: " + line);
}
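
The two-argument overload drives scans like the one above: each call resumes the search at the byte just past the previous field. A condensed, self-contained sketch of the same pattern (the tab delimiter and sample record are illustrative assumptions, not part of Hadoop-BAM):

Text line = new Text("machine\trun\t1\t1\t1022\t10394");
int pos = 0;
while (pos < line.getLength()) {
    int end = line.find("\t", pos);
    if (end < 0)
        end = line.getLength(); // the last field has no trailing delimiter
    System.out.println("field at byte " + pos + ", length " + (end - pos));
    pos = end + 1; // resume just past the delimiter
}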
 
Example 5
Source File: QueryMetricsSummaryLoader.java    From datawave with Apache License 2.0
private String getTimeUnit(Key key) {
    if (useHourlyPrecision) {
        return DateHelper.formatToHour(key.getTimestamp());
    } else {
        Text row = key.getRow();
        return new String(row.getBytes(), 0, row.find("_"));
    }
}
 
Example 6
Source File: ShardedTableTabletBalancer.java    From datawave with Apache License 2.0
@Override
public String apply(KeyExtent extent) {
    String date = "null"; // Don't return null
    if (extent != null) {
        Text endRow = extent.getEndRow();
        if (endRow != null) {
            int sepIdx = endRow.find("_");
            if (sepIdx < 0)
                sepIdx = endRow.getLength();
            date = new String(endRow.getBytes(), 0, sepIdx);
        }
    }
    return date;
}
 
Example 7
Source File: RangeStream.java    From datawave with Apache License 2.0
public static boolean isEventSpecific(Range range) {
    Text holder = new Text();
    Key startKey = range.getStartKey();
    startKey.getColumnFamily(holder);
    if (holder.getLength() > 0) {
        if (holder.find("\0") > 0) {
            return true;
        }
    }
    return false;
}
 
Example 8
Source File: QueryIterator.java    From datawave with Apache License 2.0
/**
 * Determines if a range is document specific according to the following criteria
 * 
 * <pre>
 *     1. Cannot have a null start or end key
 *     2. Cannot span multiple rows
 *     3. ColumnFamily must contain a null byte separator
 * </pre>
 *
 * @param r
 *            - {@link Range} to be evaluated
 * @return - true if this is a document specific range, false if not.
 */
public static boolean isDocumentSpecificRange(Range r) {
    Preconditions.checkNotNull(r);
    
    // Also @see datawave.query.index.lookup.TupleToRange
    // We have already made the assertion that the client is sending us
    // an inclusive start key due to the inability to ascertain the
    // difference between an event-specific range and a continueMultiScan.
    //
    // As such, it is acceptable for us to make the same assertion on the
    // inclusivity of the start key.
    
    // Cannot have a null start or end key
    if (r.isInfiniteStartKey() || r.isInfiniteStopKey()) {
        return false;
    }
    
    // Cannot span multiple rows.
    Key startKey = r.getStartKey();
    Key endKey = r.getEndKey();
    if (!startKey.getRowData().equals(endKey.getRowData())) {
        return false;
    }
    
    // Column Family must contain a null byte separator.
    Text startCF = startKey.getColumnFamily();
    Text endCF = endKey.getColumnFamily();
    if (startCF.find(Constants.NULL) == -1 || endCF.find(Constants.NULL) == -1) {
        return false;
    }
    return true;
}
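
For reference, a range that meets all three criteria could be assembled as follows. This is a sketch assuming the event-key layout used in these examples (column family = dataType\0uid, with Constants.NULL being the null byte); the row and uid values are made up:

Key start = new Key(new Text("20200101_1"), new Text("datatype\u0000uid.value"));
// followingKey() appends a null byte to the column family and keeps the same row
Key end = start.followingKey(PartialKey.ROW_COLFAM);
Range documentRange = new Range(start, true, end, false);
// Both endpoints share the row and both column families contain a null byte,
// so isDocumentSpecificRange(documentRange) returns true.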
 
Example 9
Source File: DocumentDataIterator.java    From datawave with Apache License 2.0
protected boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0
                    && cf.find("\u0000") != -1
                    && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
 
Example 10
Source File: EventDataScanNestedIterator.java    From datawave with Apache License 2.0
boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0
                    && cf.find("\u0000") != -1
                    && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
 
Example 11
Source File: UserProvider.java    From hadoop with Apache License 2.0
@Override
public synchronized List<String> getKeys() throws IOException {
  List<String> list = new ArrayList<String>();
  List<Text> keys = credentials.getAllSecretKeys();
  for (Text key : keys) {
    if (key.find("@") == -1) {
      list.add(key.toString());
    }
  }
  return list;
}
 
Example 12
Source File: Phase3Step2DistinctDataJob.java    From dkpro-c4corpus with Apache License 2.0
@Override
public void reduce(Text key, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException
{
    // hard-split using array copy to prevent Java heap space errors;
    // getBytes() may return a backing array longer than the encoded content,
    // so size the copy with getLength() rather than bytes.length
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    outputKey.append(bytes, i + 1, key.getLength() - i - 1);

    String fileName = new String(bytes, 0, i);

    multipleOutputs.write(new Text(outputKey), NullWritable.get(), fileName);
}
 
Example 13
Source File: AggregatingRecordReader.java    From datawave with Apache License 2.0
protected boolean nextKeyValuePastBlock(boolean hasNext) throws IOException {
    Preconditions.checkNotNull(positionAwareLineReader, "positionAwareLineReader cannot be null");
    
    // If we're in the middle of an element and have "run out" of data in the
    // current InputSplit. If aggValue is empty (meaning we just cleared it out
    // and are trying to find a new record) and we got to this point, we do not
    // want to read into the next block.
    if (aggValue.getLength() > 0 && positionAwareLineReader.getPos() > positionAwareLineReader.getEnd()) {
        int end = aggValue.find(endToken);
        int prevLength = aggValue.getLength();
        Text endRecordFromNextBlock = new Text();
        
        // We want to loop until we can find an end token to match the start token we already have
        while (end == -1) {
            endRecordFromNextBlock.clear();
            
            int newSize = positionAwareLineReader.getLfLineReader().readLine(endRecordFromNextBlock, positionAwareLineReader.getMaxLineLength(),
                            Integer.MAX_VALUE);
            if (0 == newSize) {
                // This fails in the same manner as the process(Text) method does for
                // self-closing XML elements.
                return returnPartialMatches && aggValue.getLength() > 0;
            }
            
            // Track the extra data read
            positionAwareLineReader.setPos(positionAwareLineReader.getPos() + newSize);
            
            // Find the start and end in this next segment read
            int newStart = endRecordFromNextBlock.find(startToken), newEnd = endRecordFromNextBlock.find(endToken);
            
            // We found no start, but did find an end
            if (newStart == -1 && newEnd >= 0) {
                // Append onto the aggValue, and we're done.
                TextUtil.textAppendNoNull(aggValue, Text.decode(endRecordFromNextBlock.getBytes(), 0, newEnd + endToken.length()));
                return true;
            } else if (newStart < newEnd) {
                // We found another start token before an end token, which
                // would imply malformed XML
                
                if (returnPartialMatches) {
                    // Let's try to be nice and throw everything up until the new start character into the aggregated value.
                    TextUtil.textAppendNoNull(aggValue, Text.decode(endRecordFromNextBlock.getBytes(), 0, newStart));
                    
                    return true;
                } else {
                    return false;
                }
            } else {
                // haven't seen a start or end, so just keep aggregating
                TextUtil.textAppendNoNull(aggValue, endRecordFromNextBlock.toString());
            }
            
            end = aggValue.find(endToken, prevLength);
            prevLength = aggValue.getLength();
        }
        
        return true;
    }
    
    return hasNext;
}
 
Example 14
Source File: EventDataScanNestedIterator.java    From datawave with Apache License 2.0
protected void findNextDocument() {
    topKey = null;
    
    try {
        Text cf = new Text();
        
        /*
         * Given that we are already at a document key, this method will continue to advance the underlying source until it is either exhausted (hasTop()
         * returns false), the returned key is not in the totalRange, or the current top key shares the same row and column family as the source's next
         * key.
         */
        while (topKey == null && source.hasTop()) {
            Key k = source.getTopKey();
            if (log.isTraceEnabled())
                log.trace("Sought to " + k);
            k.getColumnFamily(cf);
            
            if (!isEventKey(k)) {
                if (cf.find("fi\0") == 0) {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'fi')");
                    }
                    // Try to do an optimized jump over the field index
                    cf.set("fi\1");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(), totalRange.isEndKeyInclusive()),
                                    columnFamilies, inclusive);
                } else if (cf.getLength() == 1 && cf.charAt(0) == 'd') {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'd'");
                    }
                    // Try to do an optimized jump over the raw documents
                    cf.set("d\0");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(), totalRange.isEndKeyInclusive()),
                                    columnFamilies, inclusive);
                } else if (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f') {
                    if (log.isDebugEnabled()) {
                        log.debug("Seeking over 'tf'");
                    }
                    // Try to do an optimized jump over the term frequencies
                    cf.set("tf\0");
                    source.seek(new Range(new Key(source.getTopKey().getRow(), cf), false, totalRange.getEndKey(), totalRange.isEndKeyInclusive()),
                                    columnFamilies, inclusive);
                } else {
                    if (log.isDebugEnabled()) {
                        log.debug("Next()'ing over the current key");
                    }
                    source.next();
                }
            } else {
                if (dataTypeFilter.apply(source.getTopKey())) {
                    this.topKey = source.getTopKey();
                } else {
                    Range nextCF = new Range(nextStartKey(source.getTopKey()), true, totalRange.getEndKey(), totalRange.isEndKeyInclusive());
                    source.seek(nextCF, columnFamilies, inclusive);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not seek in findNextDocument", e);
    }
}
 
Example 15
Source File: WikiReverseMapper.java    From wikireverse with MIT License
public void map(LongWritable inputKey, WritableWarcRecord inputValue, OutputCollector<Text, LinkArrayWritable> output,
		Reporter reporter, WikiMetadata wikiMetadata)
		throws IOException, InterruptedException {

	try {
		// Get Warc record from the writable wrapper.
		WarcRecord record = inputValue.getRecord();
		String url = record.getHeaderMetadataItem(WARC_TARGET_URI);

		if (wikiMetadata.isWikiPage(url, reporter) == false) {
			Text metadata = new Text(record.getContent());
		
			if (metadata.find(WIKIPEDIA_DOMAIN) >= 0) {
				Page page = new Page(url);
				page = MetadataParser.parse(page, metadata, LINK_TYPE, WIKIPEDIA_DOMAIN);
				Hashtable<String, LinkWritable> results = wikiMetadata.createResults(page, reporter);

				if (results != null && results.isEmpty() == false) {
					Text outputKey = new Text();
					LinkArrayWritable outputValue = new LinkArrayWritable();						
					LinkWritable[] linkArray = new LinkWritable[1];

					for(String key : results.keySet()) {
						linkArray[0] = results.get(key);

						outputKey.set(key);
						outputValue.set(linkArray);
						
						output.collect(outputKey, outputValue);
					}

					reporter.incrCounter(COUNTER_GROUP, URLS_PARSED, results.size());
				}
			}
		}
		
	} catch (URISyntaxException us) {
		reporter.incrCounter(COUNTER_GROUP, URI_SYNTAX_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(us));
	} catch (JsonParseException jp) {
		reporter.incrCounter(COUNTER_GROUP, JSON_PARSE_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(jp));
	} catch (IOException io) {
		reporter.incrCounter(COUNTER_GROUP, MAP_IO_EXCEPTION, 1);
		LOG.error(StringUtils.stringifyException(io));
	} catch (Exception e) {
		try {
			reporter.incrCounter(COUNTER_GROUP, MAP_EXCEPTION, 1);
			LOG.error(StringUtils.stringifyException(e));
		} catch (Exception ie) {
			// Log and consume inner exceptions when logging.
			LOG.error(ie.toString());
		}
	}
}