org.apache.spark.unsafe.types.UTF8String Java Examples

The following examples show how to use org.apache.spark.unsafe.types.UTF8String. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ExtIndex_DictBits.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Searches the dictionary of {@code struct} for a value, dispatching on this index's {@code dataType}.
 *
 * <p>Numeric types search for {@code numValue} (narrowed to int, or reinterpreted as a double
 * bit pattern for FLOAT/DOUBLE); STRING searches for the bytes of {@code strValue}.
 *
 * @param struct   dictionary to search
 * @param numValue numeric search key, used for INT/LONG/FLOAT/DOUBLE
 * @param strValue string search key, used for STRING
 * @return whatever the type-specific {@code BinarySearch} routine returns
 * @throws RuntimeException if {@code dataType} is not one of the handled types
 */
private int searchEntry(DictStruct struct, long numValue, UTF8String strValue) throws IOException {
    final int entryCount = struct.dictEntryCount();
    switch (dataType) {
        case ColumnType.INT:
            return BinarySearch.binarySearchInts(struct.dictEntriesAddr(), entryCount, (int) numValue);
        case ColumnType.LONG:
            return BinarySearch.binarySearchLongs(struct.dictEntriesAddr(), entryCount, numValue);
        case ColumnType.FLOAT:
            // The float key arrives encoded as the bit pattern of a double.
            return BinarySearch.binarySearchFloats(struct.dictEntriesAddr(), entryCount, (float) Double.longBitsToDouble(numValue));
        case ColumnType.DOUBLE:
            return BinarySearch.binarySearchDoubles(struct.dictEntriesAddr(), entryCount, Double.longBitsToDouble(numValue));
        case ColumnType.STRING: {
            StringsStruct dictStrings = struct.stringDictEntries();
            return BinarySearch.binarySearchStrings(dictStrings, strValue.getBaseObject(), strValue.getBaseOffset(), strValue.numBytes());
        }
        default:
            throw new RuntimeException("Illegal dataType: " + dataType);
    }
}
 
Example #2
Source File: ExtIndex_SimpleBits.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Evaluates a LIKE pattern against every row of one pack by matching each string value in turn.
 *
 * @param column   column that owns the pack
 * @param packId   id of the pack to scan
 * @param numValue not used by this method
 * @param strValue the LIKE pattern
 * @return bitmap of matching row ids, post-processed by {@code fixBitmapInPack}
 * @throws IllegalStateException if this index's data type is not STRING
 */
@Override
public BitMap like(Column column, int packId, long numValue, UTF8String strValue) throws IOException {
    DataPack dataPack = column.pack(packId);
    int rows = dataPack.valueCount();
    BitMap matches = new BitMap();
    if (dataType != ColumnType.STRING) {
        throw new IllegalStateException("column type " + dataType + " is illegal in LIKE");
    }
    int row = 0;
    while (row < rows) {
        if (SQLLike.match(dataPack.stringValueAt(row), strValue)) {
            matches.set(row);
        }
        row++;
    }
    return fixBitmapInPack(matches, rows);
}
 
Example #3
Source File: RSIndex_CMap.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Rough check of a LIKE pattern against the per-position character map of one pack.
 *
 * <p>Only the literal prefix of the pattern is examined (e.g. "ala%" or "a_l_a%"): scanning
 * stops at the first '%' or escape character, and '_' positions are skipped.
 *
 * @param packAddr address of the pack's index data
 * @param pattern  the LIKE pattern; only {@code pattern.original} is read
 * @return {@code RSValue.None} if some literal prefix byte is not set at its position,
 *         otherwise {@code RSValue.Some}
 */
public static byte _isLike(long packAddr, LikePattern pattern) {
    UTF8String raw = pattern.original;
    int length = raw.numBytes();
    Object base = raw.getBaseObject();
    long baseOffset = raw.getBaseOffset();
    int limit = Math.min(length, POSISTIONS);

    int pos = 0;
    while (pos < limit) {
        byte current = Platform.getByte(base, baseOffset + pos);
        // The escape-character case could be handled more precisely; it is treated as end of prefix.
        if (current == '%' || current == ESCAPE_CHARACTOR) {
            return RSValue.Some;
        }
        if (!(current == '_' || isSet(packAddr, current, pos))) {
            return RSValue.None;
        }
        pos++;
    }
    return RSValue.Some;
}
 
Example #4
Source File: VersionAdapter_VLT.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a pack bundle from the values buffered in {@code cache}, choosing the
 * {@code DataPackCreator_VLT.from} overload that matches {@code dataType}.
 *
 * @param version   passed through to the creator
 * @param mode      passed through to the creator
 * @param dataType  column type code that decides how the cached values are cast
 * @param isIndexed passed through to the creator
 * @param cache     source of the cached values and their count
 * @return the bundle produced by {@code DataPackCreator_VLT}
 * @throws IllegalArgumentException if {@code dataType} is not a handled type
 */
@Override
public PackBundle createPackBundle(int version, SegmentMode mode, byte dataType, boolean isIndexed, VirtualDataPack cache) {
    final Object cached = cache.cacheValues();
    final int count = cache.valueCount();
    final PackBundle bundle;
    switch (dataType) {
        case ColumnType.INT:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (int[]) cached, 0, count);
            break;
        case ColumnType.LONG:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (long[]) cached, 0, count);
            break;
        case ColumnType.FLOAT:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (float[]) cached, 0, count);
            break;
        case ColumnType.DOUBLE:
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, (double[]) cached, 0, count);
            break;
        case ColumnType.STRING:
            // Strings are handed over as a list view limited to the first `count` elements.
            bundle = DataPackCreator_VLT.from(version, mode, isIndexed, Arrays.asList((UTF8String[]) cached).subList(0, count));
            break;
        default:
            throw new IllegalArgumentException(String.format("Not support data type of %s", dataType));
    }
    return bundle;
}
 
Example #5
Source File: ExtIndex_DictBits.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Evaluates a LIKE pattern against one pack by testing every row's string value.
 *
 * @param column   column that owns the pack
 * @param packId   id of the pack to scan
 * @param numValue not used by this method
 * @param strValue the LIKE pattern
 * @return bitmap of matching row ids, post-processed by {@code fixBitmapInPack}
 * @throws IllegalStateException if this index's data type is not STRING
 */
@Override
public BitMap like(Column column, int packId, long numValue, UTF8String strValue) throws IOException {
    DataPack dataPack = column.pack(packId);
    int rows = dataPack.valueCount();
    BitMap matches = new BitMap();
    // TODO optimize for like xxx%
    if (dataType != ColumnType.STRING) {
        throw new IllegalStateException("column type " + dataType + " is illegal in LIKE");
    }
    int row = 0;
    while (row < rows) {
        if (SQLLike.match(dataPack.stringValueAt(row), strValue)) {
            matches.set(row);
        }
        row++;
    }
    return fixBitmapInPack(matches, rows);
}
 
Example #6
Source File: RSIndex_CMap_V2.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Records the leading bytes of {@code value} in the character map of one pack.
 *
 * <p>The target region is selected by {@code indexOffsetBySize} from the value's byte length.
 * At most {@code MAX_POSISTIONS} leading bytes are recorded, one per position. An empty value
 * is recorded by setting byte 1 at position 0.
 *
 * @param packAddr address of the pack's index data
 * @param value    string to record
 */
static void _putValue(long packAddr, UTF8String value) {
    final int length = value.numBytes();
    final Object base = value.getBaseObject();
    final long baseOffset = value.getBaseOffset();

    final long regionAddr = packAddr + indexOffsetBySize(length);
    final int limit = Math.min(length, MAX_POSISTIONS);

    if (limit == 0) {
        // Marker meaning "an empty string exists in this pack".
        set(regionAddr, (byte) 1, 0);
        return;
    }
    int pos = 0;
    while (pos < limit) {
        set(regionAddr, Platform.getByte(base, baseOffset + pos), pos);
        pos++;
    }
}
 
Example #7
Source File: VirtualDataPack.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Allocates the backing array for a value cache of the given column type.
 *
 * @param dataType column type code
 * @param cap      number of elements to allocate
 * @return a new {@code int[]}, {@code long[]}, {@code float[]}, {@code double[]} or
 *         {@code UTF8String[]} of length {@code cap}
 * @throws IllegalArgumentException if {@code dataType} is not a handled type
 */
private static Object allocateValues(byte dataType, int cap) {
    final Object storage;
    switch (dataType) {
        case ColumnType.INT:
            storage = new int[cap];
            break;
        case ColumnType.LONG:
            storage = new long[cap];
            break;
        case ColumnType.FLOAT:
            storage = new float[cap];
            break;
        case ColumnType.DOUBLE:
            storage = new double[cap];
            break;
        case ColumnType.STRING:
            storage = new UTF8String[cap];
            break;
        default:
            throw new IllegalArgumentException(String.format("Not support data type of %s", dataType));
    }
    return storage;
}
 
Example #8
Source File: ArrowSchemaConverter.java    From spark-bigquery-connector with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the string at {@code rowId} as a {@code UTF8String} that points directly at the
 * vector's memory instead of copying the bytes.
 *
 * @param rowId row to read
 * @return {@code null} when the holder reports the value as not set, otherwise a string
 *         backed by the address range {@code [start, end)} taken from the offset buffer
 */
@Override
final UTF8String getUTF8String(int rowId) {
  accessor.get(rowId, stringResult);
  if (stringResult.isSet == 0) {
    return null;
  }

  final ArrowBuf offsetBuffer = accessor.getOffsetBuffer();
  final int offsetPos = rowId * VarCharVector.OFFSET_WIDTH;
  final int from = offsetBuffer.getInt(offsetPos);
  final int to = offsetBuffer.getInt(offsetPos + VarCharVector.OFFSET_WIDTH);

  /* The result is read lazily, so if the memory behind the address becomes invalid the
   * data may be lost. Copying into a byte array would be safer; it is skipped here
   * for performance reasons.
   */
  final long address = stringResult.buffer.memoryAddress() + from;
  return UTF8String.fromAddress(/* base = */null, address, to - from);
}
 
Example #9
Source File: RoughCheck_R.java    From indexr with Apache License 2.0 6 votes vote down vote up
/**
 * Rough IN check of a set of values against one pack, using the column's string index.
 *
 * @param column column whose index is consulted
 * @param packId id of the pack to check
 * @param values candidate values of the IN list
 * @return {@code All} if a value yields All before any yields Some; {@code Some} as soon as
 *         a value yields Some; {@code None} if no value yields either
 */
public static byte inCheckOnPack(Column column, int packId, UTF8String[] values) throws IOException {
    RSIndexStr rsIndex = column.rsIndex();
    for (UTF8String candidate : values) {
        byte outcome = rsIndex.isValue(packId, candidate);
        if (outcome == All) {
            return All;
        }
        if (outcome == Some) {
            // A later All is very unlikely once Some has been seen, so stop scanning here.
            return Some;
        }
    }
    return None;
}
 
Example #10
Source File: RSIndex_CMap.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Rough check of whether {@code value} may occur in the given pack.
 *
 * @param packId id of the pack; asserted to be in {@code [0, packCount)}
 * @param value  value to look up
 * @return the result of {@code _isValue} for this pack's index region
 */
@Override
public byte isValue(int packId, UTF8String value) {
    assert packId >= 0 && packId < packCount;

    // Widen to long BEFORE multiplying: an int product of pack id and per-pack size
    // could silently overflow for large pack ids and yield a wrong address.
    long packAddr = bufferAddr + ((long) packId * (POSISTIONS << POSITION_BYTE_SHIFT));
    return _isValue(packAddr, value);
}
 
Example #11
Source File: ExtIndex_DictBits.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates a BETWEEN predicate on one pack using its sorted dictionary.
 *
 * <p>Both bounds are located with {@code searchEntry} and turned into a half-open range
 * {@code [from, to)} of dictionary entry ids; a negative search result is decoded as
 * {@code -result - 1}. Rows whose stored entry id falls in the range are set.
 *
 * @return {@code BitMap.NONE} for an empty range, {@code BitMap.ALL} when the range spans the
 *         whole dictionary, otherwise a bitmap of the matching rows
 * @throws IllegalStateException (via {@code Preconditions.checkState}) if the range bounds
 *         fall outside the dictionary
 */
@Override
public BitMap between(Column column, int packId, long numValue1, long numValue2, UTF8String strValue1, UTF8String strValue2) throws IOException {
    DataPack dataPack = column.pack(packId);
    DataPackNode dpn = column.dpn(packId);
    DictStruct dict = dataPack.dictStruct(dataType, dpn);
    int lowerHit = searchEntry(dict, numValue1, strValue1);
    int upperHit = searchEntry(dict, numValue2, strValue2);

    int entryCount = dict.dictEntryCount();

    // Half-open range [from, to) of dictionary entry ids.
    int from;
    if (lowerHit >= 0) {
        from = lowerHit;
    } else {
        from = -lowerHit - 1;
    }
    int to;
    if (upperHit >= 0) {
        to = upperHit + 1;
    } else {
        to = -upperHit - 1;
    }

    Preconditions.checkState(from >= 0 && from < entryCount);
    Preconditions.checkState(to >= 0 && to <= entryCount);

    int span = to - from;
    if (span <= 0) {
        return BitMap.NONE;
    }
    if (span >= entryCount) {
        return BitMap.ALL;
    }

    long dataAddr = dict.dataAddr();
    BitMap matches = new BitMap();
    for (int row = 0; row < dpn.objCount(); row++) {
        // Each row stores its dictionary entry id as a 4-byte int.
        int entryId = MemoryUtil.getInt(dataAddr + (row << 2));
        if (from <= entryId && entryId < to) {
            matches.set(row);
        }
    }
    return matches;
}
 
Example #12
Source File: ColCmpVal.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a comparison between a column attribute and a constant value.
 *
 * @param attr     the attribute being compared
 * @param numValue the constant in numeric form
 * @param strValue the constant in string form
 */
public ColCmpVal(Attr attr, long numValue, UTF8String strValue) {
    this.strValue = strValue;
    this.numValue = numValue;
    this.attr = attr;
}
 
Example #13
Source File: Between.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * JSON creator: builds a {@code Between} from its serialized properties.
 *
 * <p>The string bounds arrive as plain {@code String}s and are converted to
 * {@code UTF8String}; a {@code null} bound stays {@code null}.
 *
 * @param attr      the attribute being tested
 * @param numValue1 first numeric bound
 * @param numValue2 second numeric bound
 * @param strValue1 first string bound, may be {@code null}
 * @param strValue2 second string bound, may be {@code null}
 */
@JsonCreator
public Between(@JsonProperty("attr") Attr attr,
               @JsonProperty("numValue1") long numValue1,
               @JsonProperty("numValue2") long numValue2,
               @JsonProperty("strValue1") String strValue1,
               @JsonProperty("strValue2") String strValue2) {
    // Delegates to the constructor taking UTF8String bounds.
    this(attr, numValue1, numValue2,
            strValue1 == null ? null : UTF8String.fromString(strValue1),
            strValue2 == null ? null : UTF8String.fromString(strValue2));
}
 
Example #14
Source File: ExtIndex_DictBits.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates an IN predicate on one pack using its dictionary.
 *
 * <p>Each candidate value is looked up with {@code searchEntry}; lookups with a non-negative
 * result are kept as dictionary entry ids. Every row's stored entry id is then compared
 * against the kept ids.
 *
 * @param column    column that owns the pack
 * @param packId    id of the pack to scan
 * @param numValues candidate values for numeric columns (its length is read when the type is numeric)
 * @param strValues candidate values otherwise
 * @return {@code BitMap.NONE} if no candidate was found in the dictionary, otherwise a bitmap
 *         of the rows whose entry id equals one of the found ids
 */
@Override
public BitMap in(Column column, int packId, long[] numValues, UTF8String[] strValues) throws IOException {
    DataPack pack = column.pack(packId);
    DataPackNode dpn = column.dpn(packId);
    DictStruct struct = pack.dictStruct(dataType, dpn);
    int inCount = ColumnType.isNumber(dataType) ? numValues.length : strValues.length;
    int[] entryIds = new int[inCount];
    int entryIdCount = 0;
    for (int i = 0; i < inCount; i++) {
        long numValue = numValues == null ? 0 : numValues[i];
        UTF8String strValue = strValues == null ? null : strValues[i];
        int entryId = searchEntry(struct, numValue, strValue);
        if (entryId >= 0) {
            entryIds[entryIdCount++] = entryId;
        }
    }

    if (entryIdCount == 0) {
        return BitMap.NONE;
    }

    long dataAddr = struct.dataAddr();
    BitMap bitSet = new BitMap();
    for (int i = 0; i < dpn.objCount(); i++) {
        // Each row stores its dictionary entry id as a 4-byte int.
        int vid = MemoryUtil.getInt(dataAddr + (i << 2));
        for (int eId = 0; eId < entryIdCount; eId++) {
            if (vid == entryIds[eId]) {
                bitSet.set(i);
                // One hit decides the row; scanning the remaining ids could only
                // set the same bit again.
                break;
            }
        }
    }
    return bitSet;
}
 
Example #15
Source File: BHCompressTest.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Round-trip test: compressing and then decompressing an indexed string pack must
 * reproduce the original bytes.
 */
@Test
public void test_string() {
    final int count = 65536;
    List<UTF8String> values = new ArrayList<>();
    int added = 0;
    while (added < count) {
        values.add(UTF8String.fromString(RandomStringUtils.random(20)));
        added++;
    }
    ByteSlice original = _from_v1(values);
    ByteSlice compressed = BHCompressor.compressIndexedStr_v1(original, count);
    ByteSlice restored = BHCompressor.decompressIndexedStr_v1(compressed, count);

    boolean identical = ByteSlice.checkEquals(original, restored);
    Assert.assertEquals(true, identical);
}
 
Example #16
Source File: ExtIndex_DictBits.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates a "greater than" (optionally "or equal") predicate on one pack using its dictionary.
 *
 * <p>The value is located with {@code searchEntry} and converted to the first dictionary
 * entry id that satisfies the predicate; a negative search result is decoded as
 * {@code -result - 1}. Rows whose stored entry id is at or above that id are set.
 *
 * @param acceptEqual whether an exact dictionary hit itself counts as a match
 * @return {@code BitMap.NONE} if the first matching id is past the dictionary,
 *         {@code BitMap.ALL} if it is 0, otherwise a bitmap of the matching rows
 */
@Override
public BitMap greater(Column column, int packId, long numValue, UTF8String strValue, boolean acceptEqual) throws IOException {
    DataPack dataPack = column.pack(packId);
    DataPackNode dpn = column.dpn(packId);
    DictStruct dict = dataPack.dictStruct(dataType, dpn);

    int hit = searchEntry(dict, numValue, strValue);

    int firstMatch;
    if (hit < 0) {
        firstMatch = -hit - 1;
    } else if (acceptEqual) {
        firstMatch = hit;
    } else {
        firstMatch = hit + 1;
    }
    Preconditions.checkState(firstMatch >= 0);

    if (firstMatch >= dict.dictEntryCount()) {
        return BitMap.NONE;
    }
    if (firstMatch == 0) {
        return BitMap.ALL;
    }

    long dataAddr = dict.dataAddr();
    BitMap matches = new BitMap();
    for (int row = 0; row < dpn.objCount(); row++) {
        // Each row stores its dictionary entry id as a 4-byte int.
        int entryId = MemoryUtil.getInt(dataAddr + (row << 2));
        if (entryId >= firstMatch) {
            matches.set(row);
        }
    }
    return matches;
}
 
Example #17
Source File: FlightArrowColumnVector.java    From flight-spark-source with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the string at {@code rowId}, or {@code null} when that row is null.
 *
 * @param rowId row to read
 * @return the accessor's value for the row, or {@code null} if {@code isNullAt(rowId)}
 */
@Override
public UTF8String getUTF8String(int rowId) {
  return isNullAt(rowId) ? null : accessor.getUTF8String(rowId);
}
 
Example #18
Source File: PartitionKey.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a private copy of {@code obj} when it is a {@code UTF8String}; any other
 * object is returned unchanged.
 *
 * @param obj value to protect
 * @return a {@code UTF8String} over freshly copied bytes, or {@code obj} itself
 */
private Object defensiveCopyIfNeeded(Object obj) {
  if (!(obj instanceof UTF8String)) {
    return obj;
  }
  // The bytes backing the UTF8 string might be reused, so copy them.
  byte[] source = ((UTF8String) obj).getBytes();
  byte[] copy = Arrays.copyOf(source, source.length);
  return UTF8String.fromBytes(copy);
}
 
Example #19
Source File: GenericMutableRow.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a string for the given ordinal.
 *
 * <p>The string is appended to {@code stringValues} (created on first use) and its list
 * position is written at {@code ordinal} via {@code setLong}.
 *
 * @param ordinal field position
 * @param value   string to store
 */
@Override
public void setString(int ordinal, UTF8String value) {
    if (null == stringValues) {
        stringValues = new ArrayList<>();
    }
    final int slot = stringValues.size();
    stringValues.add(value);
    setLong(ordinal, slot);
}
 
Example #20
Source File: SparkValueWriters.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a {@code UTF8String} to the encoder as an Avro {@code Utf8} built from its bytes.
 *
 * @param s       string to write
 * @param encoder destination encoder
 */
@Override
public void write(UTF8String s, Encoder encoder) throws IOException {
  // getBytes may hand back the backing array when available; otherwise it copies,
  // which is still cheaper than Avro calling toString and paying the encoding cost.
  byte[] utf8Bytes = s.getBytes();
  Utf8 avroString = new Utf8(utf8Bytes);
  encoder.writeString(avroString);
}
 
Example #21
Source File: SparkValueReaders.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a 16-byte fixed value and returns it as the string form of a UUID.
 *
 * @param decoder source of the 16 bytes
 * @param reuse   not used by this method
 * @return the UUID's {@code toString()} as a {@code UTF8String}
 */
@Override
public UTF8String read(Decoder decoder, Object reuse) throws IOException {
  ByteBuffer scratch = BUFFER.get();
  scratch.rewind();

  // Fill the scratch buffer's backing array, then read it back as two longs.
  decoder.readFixed(scratch.array(), 0, 16);
  long high = scratch.getLong();
  long low = scratch.getLong();

  UUID uuid = new UUID(high, low);
  return UTF8String.fromString(uuid.toString());
}
 
Example #22
Source File: SparkParquetReaders.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next binary value of the column as a {@code UTF8String}.
 *
 * <p>When the value's buffer exposes a backing array, the string is created over that
 * array's range; otherwise it is created from {@code binary.getBytes()}.
 *
 * @param ignored not used by this method
 * @return the value as a {@code UTF8String}
 */
@Override
public UTF8String read(UTF8String ignored) {
  Binary binary = column.nextBinary();
  ByteBuffer buffer = binary.toByteBuffer();
  if (!buffer.hasArray()) {
    return UTF8String.fromBytes(binary.getBytes());
  }
  byte[] backing = buffer.array();
  int start = buffer.arrayOffset() + buffer.position();
  int length = buffer.remaining();
  return UTF8String.fromBytes(backing, start, length);
}
 
Example #23
Source File: SparkValueReaders.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads a 16-byte fixed value and returns it as the string form of a UUID.
 *
 * @param decoder source of the 16 bytes
 * @param reuse   not used by this method
 * @return the UUID's {@code toString()} as a {@code UTF8String}
 */
@Override
public UTF8String read(Decoder decoder, Object reuse) throws IOException {
  ByteBuffer scratch = BUFFER.get();
  scratch.rewind();

  // Fill the scratch buffer's backing array, then read it back as two longs.
  decoder.readFixed(scratch.array(), 0, 16);
  long high = scratch.getLong();
  long low = scratch.getLong();

  UUID uuid = new UUID(high, low);
  return UTF8String.fromString(uuid.toString());
}
 
Example #24
Source File: StructInternalRow.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a collection of values into {@code ArrayData}, converting each element
 * according to {@code elementType}.
 *
 * <p>Every branch passes {@code fillArray} a factory that, given the target array, yields a
 * consumer storing one (position, value) pair. The consumer's declared value type is what
 * differs between branches.
 *
 * @param elementType type of the collection's elements
 * @param values      elements to convert
 * @return the result of {@code fillArray} for the matching conversion
 * @throws UnsupportedOperationException if the element type has no branch here
 */
private ArrayData collectionToArrayData(Type elementType, Collection<?> values) {
  switch (elementType.typeId()) {
    // These types are stored as-is, with no per-element conversion.
    case BOOLEAN:
    case INTEGER:
    case DATE:
    case TIME:
    case LONG:
    case TIMESTAMP:
    case FLOAT:
    case DOUBLE:
      return fillArray(values, array -> (pos, value) -> array[pos] = value);
    case STRING:
      // Character sequences become UTF8String via their toString().
      return fillArray(values, array ->
          (BiConsumer<Integer, CharSequence>) (pos, seq) -> array[pos] = UTF8String.fromString(seq.toString()));
    case FIXED:
    case BINARY:
      // Byte buffers are converted to byte arrays.
      return fillArray(values, array ->
          (BiConsumer<Integer, ByteBuffer>) (pos, buf) -> array[pos] = ByteBuffers.toByteArray(buf));
    case DECIMAL:
      return fillArray(values, array ->
          (BiConsumer<Integer, BigDecimal>) (pos, dec) -> array[pos] = Decimal.apply(dec));
    case STRUCT:
      // Nested structs are wrapped in a StructInternalRow.
      return fillArray(values, array -> (BiConsumer<Integer, StructLike>) (pos, tuple) ->
          array[pos] = new StructInternalRow(elementType.asStructType(), tuple));
    case LIST:
      // Nested lists are converted by calling this method again.
      return fillArray(values, array -> (BiConsumer<Integer, Collection<?>>) (pos, list) ->
          array[pos] = collectionToArrayData(elementType.asListType(), list));
    case MAP:
      return fillArray(values, array -> (BiConsumer<Integer, Map<?, ?>>) (pos, map) ->
          array[pos] = mapToMapData(elementType.asMapType(), map));
    default:
      throw new UnsupportedOperationException("Unsupported array element type: " + elementType);
  }
}
 
Example #25
Source File: PartitionKey.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a private copy of {@code obj} when it is a {@code UTF8String}; any other
 * object is returned unchanged.
 *
 * @param obj value to protect
 * @return a {@code UTF8String} over freshly copied bytes, or {@code obj} itself
 */
private Object defensiveCopyIfNeeded(Object obj) {
  if (!(obj instanceof UTF8String)) {
    return obj;
  }
  // The bytes backing the UTF8 string might be reused, so copy them.
  byte[] source = ((UTF8String) obj).getBytes();
  byte[] copy = Arrays.copyOf(source, source.length);
  return UTF8String.fromBytes(copy);
}
 
Example #26
Source File: UnsafeRow.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a string for the given ordinal.
 *
 * <p>The string is appended to {@code stringValues} (created on first use) and its list
 * position is written at {@code ordinal} via {@code setLong}.
 *
 * @param ordinal field position
 * @param value   string to store
 */
@Override
public void setString(int ordinal, UTF8String value) {
    if (null == stringValues) {
        stringValues = new ArrayList<>();
    }
    final int slot = stringValues.size();
    stringValues.add(value);
    setLong(ordinal, slot);
}
 
Example #27
Source File: SparkExpressions.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the value of a Spark literal to a plain Java object.
 *
 * @param lit the Spark literal
 * @return a {@code String} for {@code UTF8String} values, a {@code BigDecimal} for
 *         {@code Decimal} values, and the literal's value unchanged otherwise
 */
private static Object valueFromSpark(Literal lit) {
  final Object raw = lit.value();
  if (raw instanceof UTF8String) {
    return raw.toString();
  }
  if (raw instanceof Decimal) {
    return ((Decimal) raw).toJavaBigDecimal();
  }
  return raw;
}
 
Example #28
Source File: OuterIndex_Inverted.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates an IN (or NOT IN) predicate across packs using this index.
 *
 * <p>Each candidate is looked up with {@code searchEntry}; lookups with a non-negative result
 * are kept as entry ids. The bitmaps of the kept entries are then merged.
 *
 * @param column    the column being filtered
 * @param numValues candidate values for numeric columns (its length is read when the type is numeric)
 * @param strValues candidate values otherwise
 * @param isNot     {@code true} for NOT IN
 * @return {@code BitMap.ALL} / {@code BitMap.NONE} for the shortcut cases, {@code BitMap.ALL}
 *         for any other NOT IN, otherwise a bitmap over {@code packCount} packs
 */
@Override
public BitMap in(Column column, long[] numValues, UTF8String[] strValues, boolean isNot) throws IOException {
    final int candidateCount = ColumnType.isNumber(dataType) ? numValues.length : strValues.length;
    final int[] foundIds = new int[candidateCount];
    int foundCount = 0;
    for (int idx = 0; idx < candidateCount; idx++) {
        long numValue = 0;
        if (numValues != null) {
            numValue = numValues[idx];
        }
        UTF8String strValue = null;
        if (strValues != null) {
            strValue = strValues[idx];
        }
        int id = searchEntry(numValue, strValue);
        if (id >= 0) {
            foundIds[foundCount] = id;
            foundCount++;
        }
    }

    if (foundCount == 0) {
        // No candidate was found by the lookup.
        if (isNot) {
            return BitMap.ALL;
        }
        return BitMap.NONE;
    }
    if (dictEntryCount == 1) {
        // The only dictionary entry was found.
        if (isNot) {
            return BitMap.NONE;
        }
        return BitMap.ALL;
    }

    if (isNot) {
        // "not in" can not be handled by this index.
        return BitMap.ALL;
    }

    DirectBitMap merged = new DirectBitMap(packCount);
    MergeBitMapUtil.readAndMergeBitmaps(bitmapReader, merged, Trick.subArray(foundIds, 0, foundCount));
    return new BitMap(merged, packCount);
}
 
Example #29
Source File: In.java    From indexr with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an array of Java strings to an array of {@code UTF8String}s.
 *
 * @param strValues strings to convert; may be {@code null} and may contain {@code null}s
 * @return {@code null} if the input is {@code null}; otherwise an array of the same length
 *         in which {@code null} elements stay {@code null}
 */
static UTF8String[] toUTF8Arr(String[] strValues) {
    if (strValues == null) {
        return null;
    }
    UTF8String[] converted = new UTF8String[strValues.length];
    int idx = 0;
    for (String source : strValues) {
        // Slots for null inputs keep the array's default null.
        if (source != null) {
            converted[idx] = UTF8String.fromString(source);
        }
        idx++;
    }
    return converted;
}
 
Example #30
Source File: SparkParquetReaders.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next binary value of the column as a {@code UTF8String}.
 *
 * <p>When the value's buffer exposes a backing array, the string is created over that
 * array's range; otherwise it is created from {@code binary.getBytes()}.
 *
 * @param ignored not used by this method
 * @return the value as a {@code UTF8String}
 */
@Override
public UTF8String read(UTF8String ignored) {
  Binary binary = column.nextBinary();
  ByteBuffer buffer = binary.toByteBuffer();
  if (!buffer.hasArray()) {
    return UTF8String.fromBytes(binary.getBytes());
  }
  byte[] backing = buffer.array();
  int start = buffer.arrayOffset() + buffer.position();
  int length = buffer.remaining();
  return UTF8String.fromBytes(backing, start, length);
}