Java Code Examples for org.apache.spark.unsafe.types.UTF8String

The following examples show how to use org.apache.spark.unsafe.types.UTF8String. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Returns the string stored at {@code rowId} as a {@code UTF8String} that wraps the
 * vector's memory directly.
 *
 * @param rowId index of the row to read
 * @return the wrapped string, or {@code null} when the holder reports the value as not set
 */
@Override
final UTF8String getUTF8String(int rowId) {
  accessor.get(rowId, stringResult);
  if (stringResult.isSet == 0) {
    return null;
  }

  // Two consecutive entries of the offset buffer delimit the value's byte range.
  ArrowBuf offsetBuffer = accessor.getOffsetBuffer();
  int offsetIndex = rowId * VarCharVector.OFFSET_WIDTH;
  int valueStart = offsetBuffer.getInt(offsetIndex);
  int valueEnd = offsetBuffer.getInt(offsetIndex + VarCharVector.OFFSET_WIDTH);

  /* The result is read lazily, so the data may be lost if the memory behind the
   * address is corrupted. Carrying a byte array would be safer; this is skipped
   * for performance reasons.
   */
  long valueAddress = stringResult.buffer.memoryAddress() + valueStart;
  int valueLength = valueEnd - valueStart;
  return UTF8String.fromAddress(/* base = */null, valueAddress, valueLength);
}
 
Example 2
Source Project: indexr   Source File: RSIndex_CMap_V2.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Records the leading bytes of {@code value} in the index area that starts at
 * {@code packAddr}.
 *
 * @param packAddr address of the pack's index area
 * @param value    string whose first bytes (at most {@code MAX_POSISTIONS}) are recorded
 */
static void _putValue(long packAddr, UTF8String value) {
    final int byteCount = value.numBytes();
    final Object base = value.getBaseObject();
    final long baseOffset = value.getBaseOffset();

    final long slotAddr = packAddr + indexOffsetBySize(byteCount);
    final int indexedBytes = Math.min(byteCount, MAX_POSISTIONS);

    if (indexedBytes == 0) {
        // Nothing to index byte by byte: flag that an empty string is present.
        set(slotAddr, (byte) 1, 0);
        return;
    }

    int pos = 0;
    while (pos < indexedBytes) {
        set(slotAddr, Platform.getByte(base, baseOffset + pos), pos);
        pos++;
    }
}
 
Example 3
Source Project: indexr   Source File: RSIndex_CMap.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Rough LIKE check of a pattern against the per-position byte index at {@code packAddr}.
 *
 * Only the literal prefix of the pattern is examined, so patterns such as "ala%" and
 * "a_l_a%" can be excluded. Scanning stops at the first '%' or escape character.
 *
 * @param packAddr address of the pack's index area
 * @param pattern  LIKE pattern; its {@code original} string is inspected byte by byte
 * @return {@code RSValue.None} when a literal byte is not set at its position,
 *         otherwise {@code RSValue.Some}
 */
public static byte _isLike(long packAddr, LikePattern pattern) {
    UTF8String rawPattern = pattern.original;
    int patternBytes = rawPattern.numBytes();
    Object base = rawPattern.getBaseObject();
    long baseOffset = rawPattern.getBaseOffset();
    int limit = patternBytes < POSISTIONS ? patternBytes : POSISTIONS;

    int pos = 0;
    while (pos < limit) {
        byte current = Platform.getByte(base, baseOffset + pos);
        // The escape-character case could be handled more precisely; it simply ends the scan.
        boolean endsLiteralPrefix = current == '%' || current == ESCAPE_CHARACTOR;
        if (endsLiteralPrefix) {
            break;
        }
        // '_' matches any single character, so only literal bytes are checked.
        boolean literal = current != '_';
        if (literal && !isSet(packAddr, current, pos)) {
            return RSValue.None;
        }
        pos++;
    }
    return RSValue.Some;
}
 
Example 4
Source Project: indexr   Source File: RoughCheck_R.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Rough IN check of several string values against one pack.
 *
 * @param column column whose string index is consulted
 * @param packId pack to check
 * @param values candidate values of the IN list
 * @return {@code All} as soon as a value yields {@code All}; {@code Some} as soon as a
 *         value yields {@code Some}; {@code None} when no value yields either
 * @throws IOException declared by the method; the visible body calls
 *         {@code column.rsIndex()} and {@code index.isValue(...)}
 */
public static byte inCheckOnPack(Column column, int packId, UTF8String[] values) throws IOException {
    RSIndexStr strIndex = column.rsIndex();
    for (UTF8String candidate : values) {
        byte verdict = strIndex.isValue(packId, candidate);
        if (verdict == All) {
            return All;
        }
        if (verdict == Some) {
            // There is very little chance of reaching All after Some, so stop right away.
            return Some;
        }
    }
    return None;
}
 
Example 5
Source Project: indexr   Source File: ExtIndex_SimpleBits.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Evaluates a LIKE predicate row by row over one pack of a string column.
 *
 * @param column   column that owns the pack
 * @param packId   pack to scan
 * @param numValue not used by this implementation
 * @param strValue LIKE pattern passed to {@code SQLLike.match}
 * @return the bitmap of matching row ids after {@code fixBitmapInPack}
 * @throws IOException declared by the method; the visible body calls {@code column.pack(packId)}
 * @throws IllegalStateException if the column type is not {@code ColumnType.STRING}
 */
@Override
public BitMap like(Column column, int packId, long numValue, UTF8String strValue) throws IOException {
    DataPack dataPack = column.pack(packId);
    int totalRows = dataPack.valueCount();
    BitMap matches = new BitMap();

    // LIKE only applies to string columns.
    if (dataType != ColumnType.STRING) {
        throw new IllegalStateException("column type " + dataType + " is illegal in LIKE");
    }

    int rowId = 0;
    while (rowId < totalRows) {
        if (SQLLike.match(dataPack.stringValueAt(rowId), strValue)) {
            matches.set(rowId);
        }
        rowId++;
    }
    return fixBitmapInPack(matches, totalRows);
}
 
Example 6
Source Project: indexr   Source File: VirtualDataPack.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Allocates the backing array for {@code cap} values of the given column type.
 *
 * @param dataType one of the {@code ColumnType} constants
 * @param cap      number of elements to allocate
 * @return an {@code int[]}, {@code long[]}, {@code float[]}, {@code double[]} or
 *         {@code UTF8String[]} of length {@code cap}
 * @throws IllegalArgumentException if {@code dataType} is none of the supported types
 */
private static Object allocateValues(byte dataType, int cap) {
    final Object storage;
    switch (dataType) {
        case ColumnType.INT:
            storage = new int[cap];
            break;
        case ColumnType.LONG:
            storage = new long[cap];
            break;
        case ColumnType.FLOAT:
            storage = new float[cap];
            break;
        case ColumnType.DOUBLE:
            storage = new double[cap];
            break;
        case ColumnType.STRING:
            storage = new UTF8String[cap];
            break;
        default:
            throw new IllegalArgumentException(String.format("Not support data type of %s", dataType));
    }
    return storage;
}
 
Example 7
Source Project: indexr   Source File: ExtIndex_DictBits.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Evaluates a LIKE predicate row by row over one pack of a string column.
 *
 * @param column   column that owns the pack
 * @param packId   pack to scan
 * @param numValue not used by this implementation
 * @param strValue LIKE pattern passed to {@code SQLLike.match}
 * @return the bitmap of matching row ids after {@code fixBitmapInPack}
 * @throws IOException declared by the method; the visible body calls {@code column.pack(packId)}
 * @throws IllegalStateException if the column type is not {@code ColumnType.STRING}
 */
@Override
public BitMap like(Column column, int packId, long numValue, UTF8String strValue) throws IOException {
    DataPack dataPack = column.pack(packId);
    int totalRows = dataPack.valueCount();
    BitMap matches = new BitMap();

    // TODO optimize for like xxx%
    if (dataType != ColumnType.STRING) {
        throw new IllegalStateException("column type " + dataType + " is illegal in LIKE");
    }

    int rowId = 0;
    while (rowId < totalRows) {
        if (SQLLike.match(dataPack.stringValueAt(rowId), strValue)) {
            matches.set(rowId);
        }
        rowId++;
    }
    return fixBitmapInPack(matches, totalRows);
}
 
Example 8
Source Project: indexr   Source File: VersionAdapter_VLT.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a pack bundle from the values cached in {@code cache}.
 *
 * The cached array is cast according to {@code dataType}; only its first
 * {@code cache.valueCount()} elements are handed to {@code DataPackCreator_VLT.from}.
 *
 * @param version   passed through to the creator
 * @param mode      passed through to the creator
 * @param dataType  one of the {@code ColumnType} constants, selects the array type
 * @param isIndexed passed through to the creator
 * @param cache     source of the cached values and their count
 * @return the bundle produced by {@code DataPackCreator_VLT.from}
 * @throws IllegalArgumentException if {@code dataType} is none of the supported types
 */
@Override
public PackBundle createPackBundle(int version, SegmentMode mode, byte dataType, boolean isIndexed, VirtualDataPack cache) {
    final Object cached = cache.cacheValues();
    final int count = cache.valueCount();

    final PackBundle bundle;
    switch (dataType) {
        case ColumnType.INT:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (int[]) cached, 0, count);
            break;
        case ColumnType.LONG:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (long[]) cached, 0, count);
            break;
        case ColumnType.FLOAT:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (float[]) cached, 0, count);
            break;
        case ColumnType.DOUBLE:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (double[]) cached, 0, count);
            break;
        case ColumnType.STRING:
            // Strings are passed as a list view limited to the filled part of the array.
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, Arrays.asList((UTF8String[]) cached).subList(0, count));
            break;
        default:
            throw new IllegalArgumentException(String.format("Not support data type of %s", dataType));
    }
    return bundle;
}
 
Example 9
Source Project: indexr   Source File: ExtIndex_DictBits.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Binary-searches the dictionary of {@code struct} for a value of this index's data type.
 *
 * For FLOAT and DOUBLE the bits of {@code numValue} are reinterpreted with
 * {@code Double.longBitsToDouble}; for INT the value is narrowed with a cast.
 *
 * @param struct   dictionary to search
 * @param numValue numeric search value, used for the numeric types
 * @param strValue string search value, used for {@code ColumnType.STRING}
 * @return whatever the matching {@code BinarySearch} routine returns
 * @throws IOException declared by the method
 * @throws RuntimeException if the data type is not one of the handled types
 */
private int searchEntry(DictStruct struct, long numValue, UTF8String strValue) throws IOException {
    final int entryCount = struct.dictEntryCount();
    switch (dataType) {
        case ColumnType.INT:
            return BinarySearch.binarySearchInts(struct.dictEntriesAddr(), entryCount, (int) numValue);
        case ColumnType.LONG:
            return BinarySearch.binarySearchLongs(struct.dictEntriesAddr(), entryCount, numValue);
        case ColumnType.FLOAT:
            return BinarySearch.binarySearchFloats(struct.dictEntriesAddr(), entryCount, (float) Double.longBitsToDouble(numValue));
        case ColumnType.DOUBLE:
            return BinarySearch.binarySearchDoubles(struct.dictEntriesAddr(), entryCount, Double.longBitsToDouble(numValue));
        case ColumnType.STRING: {
            StringsStruct dictStrings = struct.stringDictEntries();
            return BinarySearch.binarySearchStrings(dictStrings, strValue.getBaseObject(), strValue.getBaseOffset(), strValue.numBytes());
        }
        default:
            throw new RuntimeException("Illegal dataType: " + dataType);
    }
}
 
Example 10
/**
 * Renders the timestamp stored at {@code rowId} as a UTC date-time string.
 *
 * The stored value is treated as a count of microseconds relative to the epoch: it is
 * split into whole seconds and a microsecond remainder, which is scaled to nanoseconds.
 *
 * @param rowId index of the row to read
 * @return the {@code LocalDateTime.toString()} form of the timestamp as a {@code UTF8String}
 */
@Override
final UTF8String getUTF8String(int rowId) {
  long epoch = accessor.get(rowId);

  // Use floor semantics so that timestamps before the epoch (negative values) split into
  // a smaller second count and a non-negative remainder. Plain / and % would produce a
  // negative nano-of-second, which LocalDateTime.ofEpochSecond rejects.
  long epochSeconds = Math.floorDiv(epoch, 1_000_000L);
  int nanoOfSecond = (int) Math.floorMod(epoch, 1_000_000L) * 1000;

  LocalDateTime dateTime = LocalDateTime.ofEpochSecond(epochSeconds, nanoOfSecond, ZoneOffset.UTC);

  return UTF8String.fromString(dateTime.toString());
}
 
Example 11
/**
 * Returns the string at {@code rowId}, or {@code null} when that row is null.
 *
 * @param rowId index of the row to read
 * @return the accessor's value for the row, or {@code null} if {@code isNullAt(rowId)}
 */
@Override
public UTF8String getUTF8String(int rowId) {
  return isNullAt(rowId) ? null : accessor.getUTF8String(rowId);
}
 
Example 12
Source Project: indexr   Source File: RSIndex_CMap.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks {@code value} against the index section that belongs to pack {@code packId}.
 *
 * @param packId pack to check; asserted to lie in {@code [0, packCount)}
 * @param value  string to look up
 * @return the result of {@code _isValue} for the pack's index section
 */
@Override
public byte isValue(int packId, UTF8String value) {
    assert packId >= 0 && packId < packCount;

    // Widen packId to long before multiplying: the per-pack section size times a large
    // pack id could otherwise wrap around in 32-bit arithmetic and yield a wrong address.
    long packAddr = bufferAddr + ((long) packId * (POSISTIONS << POSITION_BYTE_SHIFT));
    return _isValue(packAddr, value);
}
 
Example 13
Source Project: indexr   Source File: ExtIndex_DictBits.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates a BETWEEN predicate on one dictionary-encoded pack.
 *
 * Both bounds are looked up with {@code searchEntry}. A non-negative result {@code r}
 * is used as an entry id; a negative result is mapped to {@code -r - 1}. The bounds form
 * the half-open entry-id range {@code [from, to)}, and every row whose stored entry id
 * falls in that range is set in the result.
 *
 * @param column    column that owns the pack
 * @param packId    pack to scan
 * @param numValue1 lower bound for numeric types
 * @param numValue2 upper bound for numeric types
 * @param strValue1 lower bound for strings
 * @param strValue2 upper bound for strings
 * @return {@code BitMap.NONE} for an empty range, {@code BitMap.ALL} when the range spans
 *         the whole dictionary, otherwise a bitmap of the matching rows
 * @throws IOException declared by the method
 * @throws IllegalStateException if a {@code Preconditions.checkState} bound check fails
 */
@Override
public BitMap between(Column column, int packId, long numValue1, long numValue2, UTF8String strValue1, UTF8String strValue2) throws IOException {
    DataPack dataPack = column.pack(packId);
    DataPackNode dpn = column.dpn(packId);
    DictStruct dict = dataPack.dictStruct(dataType, dpn);
    int lowerHit = searchEntry(dict, numValue1, strValue1);
    int upperHit = searchEntry(dict, numValue2, strValue2);

    int entryCount = dict.dictEntryCount();

    // Half-open range [from, to) of dictionary entry ids.
    int from = lowerHit;
    if (from < 0) {
        from = -from - 1;
    }
    int to;
    if (upperHit >= 0) {
        to = upperHit + 1;
    } else {
        to = -upperHit - 1;
    }

    Preconditions.checkState(from >= 0 && from < entryCount);
    Preconditions.checkState(to >= 0 && to <= entryCount);

    int span = to - from;
    if (span <= 0) {
        return BitMap.NONE;
    }
    if (span >= entryCount) {
        return BitMap.ALL;
    }

    // Each row stores its entry id as a 4-byte int at dataAddr + row * 4.
    long dataAddr = dict.dataAddr();
    BitMap hits = new BitMap();
    for (int row = 0; row < dpn.objCount(); row++) {
        int entryId = MemoryUtil.getInt(dataAddr + (row << 2));
        boolean inRange = entryId >= from && entryId < to;
        if (inRange) {
            hits.set(row);
        }
    }
    return hits;
}
 
Example 14
Source Project: indexr   Source File: BHCompressTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Round-trips 65536 random 20-character strings through the indexed-string compressor
 * and checks that decompression reproduces the original bytes.
 */
@Test
public void test_string() {
    final int itemCount = 65536;
    List<UTF8String> inputs = new ArrayList<>();
    int added = 0;
    while (added < itemCount) {
        String randomText = RandomStringUtils.random(20);
        inputs.add(UTF8String.fromString(randomText));
        added++;
    }

    ByteSlice original = _from_v1(inputs);
    ByteSlice compressed = BHCompressor.compressIndexedStr_v1(original, itemCount);
    ByteSlice restored = BHCompressor.decompressIndexedStr_v1(compressed, itemCount);

    boolean identical = ByteSlice.checkEquals(original, restored);
    Assert.assertEquals(true, identical);
}
 
Example 15
Source Project: iceberg   Source File: SparkValueWriters.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code s} to the encoder as an Avro {@code Utf8} built from its bytes.
 *
 * @param s       string to write
 * @param encoder destination encoder
 * @throws IOException declared by the method; the visible body calls {@code encoder.writeString}
 */
@Override
public void write(UTF8String s, Encoder encoder) throws IOException {
  // getBytes may hand back the backing byte array when one is available; otherwise it
  // copies into a new array, which is still cheaper than Avro calling toString and
  // paying the encoding cost.
  byte[] utf8Bytes = s.getBytes();
  Utf8 avroString = new Utf8(utf8Bytes);
  encoder.writeString(avroString);
}
 
Example 16
Source Project: iceberg   Source File: SparkValueWriters.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Parses {@code s} as a UUID and writes its 128 bits as a fixed value: most significant
 * long first, then least significant long.
 *
 * @param s       textual UUID
 * @param encoder destination encoder
 * @throws IOException declared by the method; the visible body calls {@code encoder.writeFixed}
 */
@Override
public void write(UTF8String s, Encoder encoder) throws IOException {
  // TODO: direct conversion from string to byte buffer
  UUID parsed = UUID.fromString(s.toString());

  // Reuse the shared scratch buffer, writing from its start.
  ByteBuffer scratch = BUFFER.get();
  scratch.rewind();
  scratch.putLong(parsed.getMostSignificantBits())
      .putLong(parsed.getLeastSignificantBits());

  encoder.writeFixed(scratch.array());
}
 
Example 17
Source Project: iceberg   Source File: IcebergArrowColumnVector.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the string at {@code rowId}, or {@code null} when that row is null.
 *
 * @param rowId index of the row to read
 * @return the accessor's value for the row, or {@code null} if {@code isNullAt(rowId)}
 */
@Override
public UTF8String getUTF8String(int rowId) {
  return isNullAt(rowId) ? null : accessor.getUTF8String(rowId);
}
 
Example 18
Source Project: iceberg   Source File: ArrowVectorAccessors.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the string stored at {@code rowId} as a {@code UTF8String} that wraps the byte
 * range reported by the holder.
 *
 * @param rowId index of the row to read
 * @return the wrapped string, or {@code null} when the holder reports the value as not set
 */
@Override
final UTF8String getUTF8String(int rowId) {
  vector.get(rowId, stringResult);
  if (stringResult.isSet == 0) {
    return null;
  }

  // The holder carries the buffer plus the start/end positions of the value inside it.
  long valueAddress = stringResult.buffer.memoryAddress() + stringResult.start;
  int valueLength = stringResult.end - stringResult.start;
  return UTF8String.fromAddress(null, valueAddress, valueLength);
}
 
Example 19
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next binary value from the column and wraps it as a {@code UTF8String}.
 *
 * @param ignored not used
 * @return a string over the buffer's backing array when it has one, otherwise a string
 *         built from {@code binary.getBytes()}
 */
@Override
public UTF8String read(UTF8String ignored) {
  Binary binary = column.nextBinary();
  ByteBuffer buffer = binary.toByteBuffer();

  if (!buffer.hasArray()) {
    // No accessible backing array: fall back to the bytes supplied by the binary itself.
    return UTF8String.fromBytes(binary.getBytes());
  }

  byte[] backing = buffer.array();
  int offset = buffer.arrayOffset() + buffer.position();
  int length = buffer.remaining();
  return UTF8String.fromBytes(backing, offset, length);
}
 
Example 20
Source Project: indexr   Source File: Between.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * JSON creator: converts the two string bounds to {@code UTF8String} (keeping
 * {@code null} as {@code null}) and delegates to the {@code UTF8String}-based constructor.
 *
 * @param attr      attribute the predicate applies to
 * @param numValue1 first numeric bound
 * @param numValue2 second numeric bound
 * @param strValue1 first string bound, may be {@code null}
 * @param strValue2 second string bound, may be {@code null}
 */
@JsonCreator
public Between(@JsonProperty("attr") Attr attr,
               @JsonProperty("numValue1") long numValue1,
               @JsonProperty("numValue2") long numValue2,
               @JsonProperty("strValue1") String strValue1,
               @JsonProperty("strValue2") String strValue2) {
    this(
            attr,
            numValue1,
            numValue2,
            strValue1 != null ? UTF8String.fromString(strValue1) : null,
            strValue2 != null ? UTF8String.fromString(strValue2) : null);
}
 
Example 21
Source Project: iceberg   Source File: SparkValueReaders.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads a 16-byte fixed value from the decoder and returns it as a UUID string.
 *
 * The first 8 bytes become the most significant bits and the next 8 bytes the least
 * significant bits of the UUID.
 *
 * @param decoder source decoder
 * @param reuse   not used
 * @return the UUID's {@code toString()} form as a {@code UTF8String}
 * @throws IOException declared by the method; the visible body calls {@code decoder.readFixed}
 */
@Override
public UTF8String read(Decoder decoder, Object reuse) throws IOException {
  // Reuse the shared scratch buffer, reading from its start.
  ByteBuffer scratch = BUFFER.get();
  scratch.rewind();
  decoder.readFixed(scratch.array(), 0, 16);

  long high = scratch.getLong();
  long low = scratch.getLong();
  UUID uuid = new UUID(high, low);

  return UTF8String.fromString(uuid.toString());
}
 
Example 22
Source Project: iceberg   Source File: StructInternalRow.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts a collection of values into {@code ArrayData}, choosing a per-element
 * conversion from the element type's {@code typeId()}.
 *
 * Each branch hands {@code fillArray} a function that, given the target array, returns the
 * consumer used to store one element at a position.
 * NOTE(review): how {@code fillArray} allocates and iterates is not visible here — confirm
 * against its definition.
 *
 * @param elementType type of the collection's elements
 * @param values      elements to convert
 * @return the result of {@code fillArray} for the selected conversion
 * @throws UnsupportedOperationException if the element type id is not handled below
 */
private ArrayData collectionToArrayData(Type elementType, Collection<?> values) {
  switch (elementType.typeId()) {
    // These types are stored as-is, without conversion.
    case BOOLEAN:
    case INTEGER:
    case DATE:
    case TIME:
    case LONG:
    case TIMESTAMP:
    case FLOAT:
    case DOUBLE:
      return fillArray(values, array -> (pos, value) -> array[pos] = value);
    // Character sequences are converted to UTF8String via their toString() form.
    case STRING:
      return fillArray(values, array ->
          (BiConsumer<Integer, CharSequence>) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString()));
    // Byte buffers are converted with ByteBuffers.toByteArray.
    case FIXED:
    case BINARY:
      return fillArray(values, array ->
          (BiConsumer<Integer, ByteBuffer>) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf));
    // BigDecimal values are converted with Decimal.apply.
    case DECIMAL:
      return fillArray(values, array ->
          (BiConsumer<Integer, BigDecimal>) (pos, dec) -> array[pos] = Decimal.apply(dec));
    // Nested structs are wrapped in a StructInternalRow of the element's struct type.
    case STRUCT:
      return fillArray(values, array -> (BiConsumer<Integer, StructLike>) (pos, tuple) ->
          array[pos] = new StructInternalRow(elementType.asStructType(), tuple));
    // Nested lists are converted by calling this method again with the element's list type.
    case LIST:
      return fillArray(values, array -> (BiConsumer<Integer, Collection<?>>) (pos, list) ->
          array[pos] = collectionToArrayData(elementType.asListType(), list));
    // Nested maps are converted with mapToMapData.
    case MAP:
      return fillArray(values, array -> (BiConsumer<Integer, Map<?, ?>>) (pos, map) ->
          array[pos] = mapToMapData(elementType.asMapType(), map));
    default:
      throw new UnsupportedOperationException("Unsupported array element type: " + elementType);
  }
}
 
Example 23
Source Project: iceberg   Source File: PartitionKey.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a copy of {@code obj} when it is a {@code UTF8String}; any other object is
 * returned unchanged.
 *
 * @param obj value to protect
 * @return a {@code UTF8String} over a fresh byte array, or {@code obj} itself
 */
private Object defensiveCopyIfNeeded(Object obj) {
  if (!(obj instanceof UTF8String)) {
    return obj;
  }

  // The bytes behind a UTF8String might be reused, so detach them into a new array.
  byte[] source = ((UTF8String) obj).getBytes();
  byte[] detached = Arrays.copyOf(source, source.length);
  return UTF8String.fromBytes(detached);
}
 
Example 24
Source Project: iceberg   Source File: SparkExpressions.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts the value of a Spark literal to a plain Java value.
 *
 * @param lit literal to convert
 * @return the {@code toString()} form for a {@code UTF8String}, a {@code BigDecimal} for a
 *         {@code Decimal}, and the literal's value unchanged otherwise
 */
private static Object valueFromSpark(Literal lit) {
  Object sparkValue = lit.value();
  if (sparkValue instanceof UTF8String) {
    return sparkValue.toString();
  }
  if (sparkValue instanceof Decimal) {
    return ((Decimal) sparkValue).toJavaBigDecimal();
  }
  return sparkValue;
}
 
Example 25
Source Project: iceberg   Source File: SparkParquetReaders.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next binary value from the column and wraps it as a {@code UTF8String}.
 *
 * @param ignored not used
 * @return a string over the buffer's backing array when it has one, otherwise a string
 *         built from {@code binary.getBytes()}
 */
@Override
public UTF8String read(UTF8String ignored) {
  Binary binary = column.nextBinary();
  ByteBuffer buffer = binary.toByteBuffer();

  if (!buffer.hasArray()) {
    // No accessible backing array: fall back to the bytes supplied by the binary itself.
    return UTF8String.fromBytes(binary.getBytes());
  }

  byte[] backing = buffer.array();
  int offset = buffer.arrayOffset() + buffer.position();
  int length = buffer.remaining();
  return UTF8String.fromBytes(backing, offset, length);
}
 
Example 26
Source Project: iceberg   Source File: SparkValueReaders.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads a 16-byte fixed value from the decoder and returns it as a UUID string.
 *
 * The first 8 bytes become the most significant bits and the next 8 bytes the least
 * significant bits of the UUID.
 *
 * @param decoder source decoder
 * @param reuse   not used
 * @return the UUID's {@code toString()} form as a {@code UTF8String}
 * @throws IOException declared by the method; the visible body calls {@code decoder.readFixed}
 */
@Override
public UTF8String read(Decoder decoder, Object reuse) throws IOException {
  // Reuse the shared scratch buffer, reading from its start.
  ByteBuffer scratch = BUFFER.get();
  scratch.rewind();
  decoder.readFixed(scratch.array(), 0, 16);

  long high = scratch.getLong();
  long low = scratch.getLong();
  UUID uuid = new UUID(high, low);

  return UTF8String.fromString(uuid.toString());
}
 
Example 27
Source Project: iceberg   Source File: PartitionKey.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a copy of {@code obj} when it is a {@code UTF8String}; any other object is
 * returned unchanged.
 *
 * @param obj value to protect
 * @return a {@code UTF8String} over a fresh byte array, or {@code obj} itself
 */
private Object defensiveCopyIfNeeded(Object obj) {
  if (!(obj instanceof UTF8String)) {
    return obj;
  }

  // The bytes behind a UTF8String might be reused, so detach them into a new array.
  byte[] source = ((UTF8String) obj).getBytes();
  byte[] detached = Arrays.copyOf(source, source.length);
  return UTF8String.fromBytes(detached);
}
 
Example 28
Source Project: iceberg   Source File: RandomData.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a random string of 0 to 49 bytes, each byte drawn from {@code CHARS}.
 *
 * @param random source of randomness
 * @return the generated bytes wrapped as a {@code UTF8String}
 */
private static UTF8String randomString(Random random) {
  byte[] generated = new byte[random.nextInt(50)];

  int index = 0;
  while (index < generated.length) {
    int pick = random.nextInt(CHARS.length());
    generated[index] = (byte) CHARS.charAt(pick);
    index++;
  }

  return UTF8String.fromBytes(generated);
}
 
Example 29
Source Project: indexr   Source File: OuterIndex_Inverted.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates an IN / NOT IN predicate against this index.
 *
 * Every candidate value is looked up with {@code searchEntry}; the non-negative results
 * are collected and their bitmaps merged into the result.
 *
 * @param column    column the predicate applies to (not read by the visible body)
 * @param numValues candidate values for numeric columns
 * @param strValues candidate values for string columns
 * @param isNot     {@code true} for NOT IN
 * @return {@code BitMap.ALL} / {@code BitMap.NONE} for the shortcut cases, {@code BitMap.ALL}
 *         for any remaining NOT IN, otherwise the merged bitmap over {@code packCount} packs
 * @throws IOException declared by the method
 */
@Override
public BitMap in(Column column, long[] numValues, UTF8String[] strValues, boolean isNot) throws IOException {
    int candidateCount = ColumnType.isNumber(dataType) ? numValues.length : strValues.length;
    int[] foundIds = new int[candidateCount];
    int foundCount = 0;
    for (int i = 0; i < candidateCount; i++) {
        long numValue = 0;
        if (numValues != null) {
            numValue = numValues[i];
        }
        UTF8String strValue = null;
        if (strValues != null) {
            strValue = strValues[i];
        }

        int entryId = searchEntry(numValue, strValue);
        if (entryId >= 0) {
            foundIds[foundCount] = entryId;
            foundCount++;
        }
    }

    if (foundCount == 0) {
        // None of the candidates has a non-negative entry id.
        return isNot ? BitMap.ALL : BitMap.NONE;
    }
    if (dictEntryCount == 1) {
        // The only dictionary entry was found.
        return isNot ? BitMap.NONE : BitMap.ALL;
    }
    if (isNot) {
        // "not in" can not be handled by this index.
        return BitMap.ALL;
    }

    DirectBitMap merged = new DirectBitMap(packCount);
    MergeBitMapUtil.readAndMergeBitmaps(bitmapReader, merged, Trick.subArray(foundIds, 0, foundCount));
    return new BitMap(merged, packCount);
}
 
Example 30
Source Project: indexr   Source File: In.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts an array of Java strings to an array of {@code UTF8String}.
 *
 * @param strValues strings to convert, may be {@code null}
 * @return {@code null} for a {@code null} input; otherwise an array of the same length in
 *         which {@code null} elements stay {@code null}
 */
static UTF8String[] toUTF8Arr(String[] strValues) {
    if (strValues == null) {
        return null;
    }

    final int count = strValues.length;
    UTF8String[] converted = new UTF8String[count];
    for (int idx = 0; idx < count; idx++) {
        String raw = strValues[idx];
        // Slots start out null, so only non-null inputs need to be filled in.
        if (raw != null) {
            converted[idx] = UTF8String.fromString(raw);
        }
    }
    return converted;
}