Java Code Examples for org.apache.hadoop.io.Text#getLength()

The following examples show how to use org.apache.hadoop.io.Text#getLength(). They are taken from open-source projects; the source file, project, and license are noted above each snippet.
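Before the examples, it helps to keep the basic contract in mind: Text#getBytes() returns the backing byte array, which can be longer than the valid data, while Text#getLength() returns the number of valid UTF-8 bytes. That is why the examples below bound any direct access to the byte array with getLength(). A minimal, self-contained sketch of the contract (the class name and sample strings are purely illustrative; the exact buffer capacity can vary by Hadoop version):

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

public class TextLengthDemo {
    public static void main(String[] args) {
        Text text = new Text();
        byte[] first = "hello world".getBytes(StandardCharsets.UTF_8);
        text.set(first, 0, first.length);     // backing buffer grows to hold 11 bytes

        byte[] second = "hi".getBytes(StandardCharsets.UTF_8);
        text.set(second, 0, second.length);   // buffer is reused; only the first 2 bytes are now valid

        byte[] raw = text.getBytes();         // backing array; may be longer than the valid data
        int len = text.getLength();           // number of valid UTF-8 bytes (2 here)

        // Always bound reads by getLength(), never by getBytes().length.
        String decoded = new String(raw, 0, len, StandardCharsets.UTF_8);
        System.out.println(decoded + " (" + len + " valid bytes, buffer length " + raw.length + ")");
    }
}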
Example 1
Source File: RangeKeyDistributionMapper.java    From Kylin with Apache License 2.0
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    lastKey = key;

    int bytesLength = key.getLength() + value.getLength();
    bytesRead += bytesLength;

    if (bytesRead >= ONE_MEGA_BYTES) {
        outputValue.set(bytesRead);
        context.write(key, outputValue);

        // reset bytesRead
        bytesRead = 0;
    }

}
 
Example 2
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
private void setFieldPositionsAndLengths(Text line)
{
	int pos = 0; // the byte position within the record
	int fieldno = 0; // the field index within the record
	while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field
	{
		int endpos = line.find(Delim, pos); // the field's end position
		if (endpos < 0)
			endpos = line.getLength();

		fieldPositions[fieldno] = pos;
		fieldLengths[fieldno] = endpos - pos;

		pos = endpos + 1; // the next starting position is the current end + 1
		fieldno += 1;
	}

	if (fieldno != NUM_QSEQ_COLS)
		throw new FormatException("found " + fieldno + " fields instead of 11 at " +
		            makePositionMessage(this.pos - line.getLength()) + ". Line: " + line);
}
 
Example 3
Source File: AccumuloUtils.java    From mrgeo with Apache License 2.0
/**
 * Convert a Text object holding a tileId back to a long.
 *
 * @param rowId Text object to convert.
 * @return the long value from the Text object.
 */
public static long toLong(Text rowId)
{

  byte[] outB = new byte[8];
  for (int x = 0; x < outB.length; x++)
  {
    if (x >= rowId.getLength())
    {
      outB[x] = 0x0;
    }
    else
    {
      outB[x] = rowId.getBytes()[x];
    }
  }

  return ByteBuffer.wrap(outB).getLong();
}
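Since toLong() zero-pads on the right when the row id is shorter than eight bytes, it effectively reads the row as a left-aligned big-endian long. A small round-trip sketch under that assumption (the class name and tile id value are illustrative only):

import java.nio.ByteBuffer;
import org.apache.hadoop.io.Text;

public class TileIdRoundTrip {
    public static void main(String[] args) {
        long tileId = 42L;

        // Encode the tile id as 8 big-endian bytes and wrap them in a Text,
        // mirroring the row-id layout that toLong() above expects.
        byte[] rowBytes = ByteBuffer.allocate(8).putLong(tileId).array();
        Text rowId = new Text();
        rowId.set(rowBytes, 0, rowBytes.length);

        // Same decoding as the example: copy up to 8 valid bytes (bounded by
        // getLength()), zero-pad the rest, then read a big-endian long.
        byte[] outB = new byte[8];
        for (int x = 0; x < outB.length; x++) {
            outB[x] = (x < rowId.getLength()) ? rowId.getBytes()[x] : (byte) 0;
        }
        System.out.println(ByteBuffer.wrap(outB).getLong()); // prints 42
    }
}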
 
Example 4
Source File: ShardedTableTabletBalancer.java    From datawave with Apache License 2.0
private byte[] retrieveDate(KeyExtent extent) {
    Text endRow = extent.getEndRow();
    if (endRow == null)
        endRow = extent.getPrevEndRow();
    if (endRow == null) {
        log.warn("Attempting to retrieve date from empty extent " + extent + ". Is your sharded table pre-split?");
        return "null".getBytes();
    } else {
        int idx = endRow.find("_");
        if (idx <= 0) {
            idx = endRow.getLength();
            log.warn("Extent " + extent + " does not conform to sharded date scheme yyyyMMdd_num");
        }
        return Arrays.copyOf(endRow.getBytes(), idx);
    }
}
 
Example 5
Source File: TeraSort.java    From hadoop-gpu with Apache License 2.0
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, 
                                  Text prefix, int maxDepth) {
  int depth = prefix.getLength();
  if (depth >= maxDepth || lower == upper) {
    return new LeafTrieNode(depth, splits, lower, upper);
  }
  InnerTrieNode result = new InnerTrieNode(depth);
  Text trial = new Text(prefix);
  // append an extra byte on to the prefix
  trial.append(new byte[1], 0, 1);
  int currentBound = lower;
  for(int ch = 0; ch < 255; ++ch) {
    trial.getBytes()[depth] = (byte) (ch + 1);
    lower = currentBound;
    while (currentBound < upper) {
      if (splits[currentBound].compareTo(trial) >= 0) {
        break;
      }
      currentBound += 1;
    }
    trial.getBytes()[depth] = (byte) ch;
    result.child[ch] = buildTrie(splits, lower, currentBound, trial, 
                                 maxDepth);
  }
  // pick up the rest
  trial.getBytes()[depth] = 127;
  result.child[255] = buildTrie(splits, currentBound, upper, trial,
                                maxDepth);
  return result;
}
 
Example 6
Source File: ToISO8601UDF.java    From occurrence with Apache License 2.0
public Text evaluate(Text field) {
  if (field == null || field.getLength() == 0) {
    return null;
  } else {
    try {
      text.set(DownloadUtils.ISO_8601_ZONED.format(Instant.ofEpochMilli(Long.parseLong(field.toString())).atZone(ZoneOffset.UTC)));
      return text;
    } catch (NumberFormatException e) {
      return null;
    }
  }
}
 
Example 7
Source File: EventDataScanNestedIterator.java    From datawave with Apache License 2.0
boolean isEventKey(Key k) {
    Text cf = k.getColumnFamily();
    return cf.getLength() > 0
                    && cf.find("\u0000") != -1
                    && !((cf.charAt(0) == 'f' && cf.charAt(1) == 'i' && cf.charAt(2) == 0)
                                    || (cf.getLength() == 1 && cf.charAt(0) == 'd')
                                    || (cf.getLength() == 2 && cf.charAt(0) == 't' && cf.charAt(1) == 'f'));
}
 
Example 8
Source File: XmlSerDe.java    From Hive-XML-SerDe with Apache License 2.0
/**
 * @see org.apache.hadoop.hive.serde2.Deserializer#deserialize(org.apache.hadoop.io.Writable)
 */
@Override
public Object deserialize(Writable writable) throws SerDeException {
    Text text = (Text) writable;
    if (text == null || text.getLength() == 0) {
        return (Object) null;
    }
    try {
        return this.xmlProcessor.parse(text.toString());
    } catch (Exception e) {
        throw new SerDeException(e);
    }
}
 
Example 9
Source File: TeraSort.java    From RDFS with Apache License 2.0
int findPartition(Text key) {
  int level = getLevel();
  if (key.getLength() <= level) {
    return child[0].findPartition(key);
  }
  return child[key.getBytes()[level]].findPartition(key);
}
 
Example 10
Source File: TopKDataGen.java    From sequenceiq-samples with Apache License 2.0
@Override
public void run() throws Exception {
    KeyValueWriter streamOutputWriter = (KeyValueWriter) getOutputs().get(OUTPUT).getWriter();
    long sizeLarge = 0;
    while (sizeLarge < streamOutputFileSize) {
        String str = createRowString();
        Text text = new Text(str);
        int size = text.getLength();
        streamOutputWriter.write(text, NullWritable.get());
        sizeLarge += size;
    }
}
 
Example 11
Source File: DeflateUDF.java    From incubator-hivemall with Apache License 2.0
@Override
public BytesWritable evaluate(DeferredObject[] arguments) throws HiveException {
    if (codec == null) {
        this.codec = new DeflateCodec(true, false);
    }

    Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }
    Text text = stringOI.getPrimitiveWritableObject(arg0);
    byte[] original = text.getBytes();
    final int len = text.getLength();
    final byte[] compressed;
    try {
        compressed = codec.compress(original, 0, len, compressionLevel);
    } catch (IOException e) {
        throw new HiveException("Failed to compress", e);
    }
    original = null;
    if (result == null) {
        this.result = new BytesWritable(compressed);
    } else {
        result.set(compressed, 0, compressed.length);
    }
    return result;
}
 
Example 12
Source File: LineRecordReader.java    From hadoop with Apache License 2.0
private int skipUtfByteOrderMark(Text value) throws IOException {
  // Strip the BOM (Byte Order Mark).
  // Text only supports UTF-8, so we only need to check for the UTF-8 BOM
  // (0xEF,0xBB,0xBF) at the start of the text stream.
  int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength,
      Integer.MAX_VALUE);
  int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
  // Even if we read 3 extra bytes for the first line,
  // we won't alter existing behavior (no backwards-incompatibility issue),
  // because the newSize is less than maxLineLength and
  // the number of bytes copied to Text is always no more than newSize.
  // If the size returned from readLine is not less than maxLineLength,
  // we will discard the current line and read the next line.
  pos += newSize;
  int textLength = value.getLength();
  byte[] textBytes = value.getBytes();
  if ((textLength >= 3) && (textBytes[0] == (byte)0xEF) &&
      (textBytes[1] == (byte)0xBB) && (textBytes[2] == (byte)0xBF)) {
    // find UTF-8 BOM, strip it.
    LOG.info("Found UTF-8 BOM and skipped it");
    textLength -= 3;
    newSize -= 3;
    if (textLength > 0) {
      // It may work to use the same buffer and not do the copyBytes
      textBytes = value.copyBytes();
      value.set(textBytes, 3, textLength);
    } else {
      value.clear();
    }
  }
  return newSize;
}
 
Example 13
Source File: ShardedTableTabletBalancer.java    From datawave with Apache License 2.0
@Override
public String apply(KeyExtent extent) {
    String date = "null"; // Don't return null
    if (extent != null) {
        Text endRow = extent.getEndRow();
        if (endRow != null) {
            int sepIdx = endRow.find("_");
            if (sepIdx < 0)
                sepIdx = endRow.getLength();
            date = new String(endRow.getBytes(), 0, sepIdx);
        }
    }
    return date;
}
 
Example 14
Source File: TeraSort.java    From RDFS with Apache License 2.0
/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, 
                                  Text prefix, int maxDepth) {
  int depth = prefix.getLength();
  if (depth >= maxDepth || lower == upper) {
    return new LeafTrieNode(depth, splits, lower, upper);
  }
  InnerTrieNode result = new InnerTrieNode(depth);
  Text trial = new Text(prefix);
  // append an extra byte on to the prefix
  trial.append(new byte[1], 0, 1);
  int currentBound = lower;
  for(int ch = 0; ch < 255; ++ch) {
    trial.getBytes()[depth] = (byte) (ch + 1);
    lower = currentBound;
    while (currentBound < upper) {
      if (splits[currentBound].compareTo(trial) >= 0) {
        break;
      }
      currentBound += 1;
    }
    trial.getBytes()[depth] = (byte) ch;
    result.child[ch] = buildTrie(splits, lower, currentBound, trial, 
                                 maxDepth);
  }
  // pick up the rest
  trial.getBytes()[depth] = 127;
  result.child[255] = buildTrie(splits, currentBound, upper, trial,
                                maxDepth);
  return result;
}
 
Example 15
Source File: TreeExportUDF.java    From incubator-hivemall with Apache License 2.0
@Nonnull
public Text export(@Nonnull Text model, @Nullable String[] featureNames,
        @Nullable String[] classNames) throws HiveException {
    int length = model.getLength();
    byte[] b = model.getBytes();
    b = Base91.decode(b, 0, length);

    final String exported;
    if (regression) {
        exported = exportRegressor(b, featureNames);
    } else {
        exported = exportClassifier(b, featureNames, classNames);
    }
    return new Text(exported);
}
 
Example 16
Source File: RandomTextWriterJob.java    From big-c with Apache License 2.0
/**
 * Given an output filename, write a bunch of random records to it.
 */
public void map(Text key, Text value,
                Context context) throws IOException,InterruptedException {
  int itemCount = 0;
  while (numBytesToWrite > 0) {
    // Generate the key/value 
    int noWordsKey = minWordsInKey + 
      (wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
    int noWordsValue = minWordsInValue + 
      (wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
    Text keyWords = generateSentence(noWordsKey);
    Text valueWords = generateSentence(noWordsValue);
    
    // Write the sentence 
    context.write(keyWords, valueWords);
    
    numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());
    
    // Update counters, progress etc.
    context.getCounter(Counters.BYTES_WRITTEN).increment(
              keyWords.getLength() + valueWords.getLength());
    context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
    if (++itemCount % 200 == 0) {
      context.setStatus("wrote record " + itemCount + ". " + 
                         numBytesToWrite + " bytes left.");
    }
  }
  context.setStatus("done with " + itemCount + " records.");
}
 
Example 17
Source File: RandomTextWriter.java    From tez with Apache License 2.0
/**
 * Given an output filename, write a bunch of random records to it.
 */
public void map(Text key, Text value,
                Context context) throws IOException,InterruptedException {
  int itemCount = 0;
  while (numBytesToWrite > 0) {
    // Generate the key/value 
    int noWordsKey = minWordsInKey + 
      (wordsInKeyRange != 0 ? random.nextInt(wordsInKeyRange) : 0);
    int noWordsValue = minWordsInValue + 
      (wordsInValueRange != 0 ? random.nextInt(wordsInValueRange) : 0);
    Text keyWords = generateSentence(noWordsKey);
    Text valueWords = generateSentence(noWordsValue);
    
    // Write the sentence 
    context.write(keyWords, valueWords);
    
    numBytesToWrite -= (keyWords.getLength() + valueWords.getLength());
    
    // Update counters, progress etc.
    context.getCounter(Counters.BYTES_WRITTEN).increment(
              keyWords.getLength() + valueWords.getLength());
    context.getCounter(Counters.RECORDS_WRITTEN).increment(1);
    if (++itemCount % 200 == 0) {
      context.setStatus("wrote record " + itemCount + ". " + 
                         numBytesToWrite + " bytes left.");
    }
  }
  context.setStatus("done with " + itemCount + " records.");
}
 
Example 18
Source File: PigStorage.java    From spork with Apache License 2.0
@Override
public Tuple getNext() throws IOException {
    mProtoTuple = new ArrayList<Object>();
    if (!mRequiredColumnsInitialized) {
        if (signature!=null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[])ObjectSerializer.deserialize(p.getProperty(signature));
        }
        mRequiredColumnsInitialized = true;
    }
    //Prepend input source path if source tagging is enabled
    if(tagFile) {
        mProtoTuple.add(new DataByteArray(sourcePath.getName()));
    } else if (tagPath) {
        mProtoTuple.add(new DataByteArray(sourcePath.toString()));
    }

    try {
        boolean notDone = in.nextKeyValue();
        if (!notDone) {
            return null;
        }
        Text value = (Text) in.getCurrentValue();
        byte[] buf = value.getBytes();
        int len = value.getLength();
        int start = 0;
        int fieldID = 0;
        for (int i = 0; i < len; i++) {
            if (buf[i] == fieldDel) {
                if (mRequiredColumns==null || (mRequiredColumns.length>fieldID && mRequiredColumns[fieldID]))
                    addTupleValue(mProtoTuple, buf, start, i);
                start = i + 1;
                fieldID++;
            }
        }
        // pick up the last field
        if (start <= len && (mRequiredColumns==null || (mRequiredColumns.length>fieldID && mRequiredColumns[fieldID]))) {
            addTupleValue(mProtoTuple, buf, start, len);
        }
        Tuple t =  mTupleFactory.newTupleNoCopy(mProtoTuple);

        return dontLoadSchema ? t : applySchema(t);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode,
                PigException.REMOTE_ENVIRONMENT, e);
    }
}
 
Example 19
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
private void scanQseqLine(Text line, Text key, SequencedFragment fragment)
{
	setFieldPositionsAndLengths(line);

	// Build the key.  We concatenate all fields from 0 to 5 (machine to y-pos)
	// and then the read number, replacing the tabs with colons.
	key.clear();
	// append up to and including field[5]
	key.append(line.getBytes(), 0, fieldPositions[5] + fieldLengths[5]);
	// replace tabs with :
	byte[] bytes = key.getBytes();
	int temporaryEnd = key.getLength();
	for (int i = 0; i < temporaryEnd; ++i)
		if (bytes[i] == '\t')
			bytes[i] = ':';
	// append the read number
	key.append(line.getBytes(), fieldPositions[7] - 1, fieldLengths[7] + 1); // +/- 1 to catch the preceding tab.
	// convert the tab preceding the read number into a :
	key.getBytes()[temporaryEnd] = ':';

	// now the fragment
	try
	{
		fragment.clear();
		fragment.setInstrument( Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0]) );
		fragment.setRunNumber( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1])) );
		//fragment.setFlowcellId();
		fragment.setLane( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2])) );
		fragment.setTile( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3])) );
		fragment.setXpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4])) );
		fragment.setYpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5])) );
		fragment.setRead( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7])) );
		fragment.setFilterPassed( line.getBytes()[fieldPositions[10]] != '0' );
		//fragment.setControlNumber();
		if (fieldLengths[6] > 0 && line.getBytes()[fieldPositions[6]] == '0') // 0 is a null index sequence
			fragment.setIndexSequence(null);
		else
			fragment.setIndexSequence(Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N'));
	}
	catch (CharacterCodingException e) {
		throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
	}

	fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]);
	fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]);
}
 
Example 20
Source File: ShardedDataTypeHandler.java    From datawave with Apache License 2.0
/**
 * Create Key from input parameters
 * 
 * For global index keys, the granularity of the timestamp is to the millisecond, where the semantics of the index record is to the day. This makes
 * MapReduce unable to reduce all index keys together unless they occurred at the same millisecond. If we truncate the timestamp to the day, we should
 * reduce the number of keys output from a job.
 * 
 * @param row
 * @param colf
 * @param colq
 * @param vis
 * @param ts
 * @param delete
 * @return Accumulo Key object
 */
protected Key createIndexKey(byte[] row, Text colf, Text colq, byte[] vis, long ts, boolean delete) {
    // Truncate the timestamp to the day
    long tsToDay = (ts / MS_PER_DAY) * MS_PER_DAY;
    
    Key k = new Key(row, 0, row.length, colf.getBytes(), 0, colf.getLength(), colq.getBytes(), 0, colq.getLength(), vis, 0, vis.length, tsToDay);
    k.setDeleted(delete);
    return k;
}
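The day-truncation in createIndexKey is plain integer division; a quick worked check of what it does (a sketch assuming MS_PER_DAY is the usual 86,400,000 and using an arbitrary timestamp):

public class TruncateToDayDemo {
    private static final long MS_PER_DAY = 24L * 60 * 60 * 1000; // 86,400,000 ms

    public static void main(String[] args) {
        long ts = 1577880000000L;                      // 2020-01-01T12:00:00Z
        long tsToDay = (ts / MS_PER_DAY) * MS_PER_DAY; // integer division drops the time-of-day remainder
        System.out.println(tsToDay);                   // 1577836800000 -> 2020-01-01T00:00:00Z
    }
}

Any two index keys generated during the same UTC day therefore carry the same truncated timestamp and can be reduced together, which is the point the Javadoc above makes.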