Java Code Examples for java.nio.charset.CharsetEncoder#maxBytesPerChar()

The following examples show how to use java.nio.charset.CharsetEncoder#maxBytesPerChar() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: StringUtils.java    From tajo with Apache License 2.0 6 votes vote down vote up
public static byte[] convertCharsToBytes(char[] src, Charset charset) {
  CharsetEncoder encoder = charset.newEncoder();
  byte[] resultArray = new byte[(int) (src.length * encoder.maxBytesPerChar())];
  
  if (src.length != 0) {
    CharBuffer charBuffer = CharBuffer.wrap(src);
    ByteBuffer byteBuffer = ByteBuffer.wrap(resultArray);
    
    encoder.onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE);
    encoder.reset();
    
    CoderResult coderResult = encoder.encode(charBuffer, byteBuffer, true);
    if (coderResult.isUnderflow()) {
      coderResult = encoder.flush(byteBuffer);
      
      if (coderResult.isUnderflow()) {
        if (resultArray.length != byteBuffer.position()) {
          resultArray = Arrays.copyOf(resultArray, byteBuffer.position());
        }
      }
    }
  }
  
  return resultArray;
}
 
Example 2
Source File: ZipCoder.java    From openjdk-jdk8u-backup with GNU General Public License v2.0 6 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 3
Source File: ZipCoder.java    From TencentKona-8 with GNU General Public License v2.0 6 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 4
Source File: ZipCoder.java    From jdk8u-dev-jdk with GNU General Public License v2.0 6 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 5
Source File: ZipCoder.java    From jdk8u_jdk with GNU General Public License v2.0 6 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 6
Source File: ZipCoder.java    From jdk8u60 with GNU General Public License v2.0 6 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 7
Source File: ZipCoder.java    From jdk8u-jdk with GNU General Public License v2.0 6 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 8
Source File: ZipCoder.java    From JDKSourceCode1.8 with MIT License 5 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    // UTF-8 only for now. Other ArrayDeocder only handles
    // CodingErrorAction.REPLACE mode.
    if (isUTF8 && ce instanceof ArrayEncoder) {
        int blen = ((ArrayEncoder)ce).encode(ca, 0, ca.length, ba);
        if (blen == -1)    // malformed
            throw new IllegalArgumentException("MALFORMED");
        return Arrays.copyOf(ba, blen);
    }
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 9
Source File: ByteBufUtil.java    From netty4.0.27Learn with Apache License 2.0 5 votes vote down vote up
static ByteBuf encodeString0(ByteBufAllocator alloc, boolean enforceHeap, CharBuffer src, Charset charset) {
    final CharsetEncoder encoder = CharsetUtil.getEncoder(charset);
    int length = (int) ((double) src.remaining() * encoder.maxBytesPerChar());
    boolean release = true;
    final ByteBuf dst;
    if (enforceHeap) {
        dst = alloc.heapBuffer(length);
    } else {
        dst = alloc.buffer(length);
    }
    try {
        final ByteBuffer dstBuf = dst.internalNioBuffer(0, length);
        final int pos = dstBuf.position();
        CoderResult cr = encoder.encode(src, dstBuf, true);
        if (!cr.isUnderflow()) {
            cr.throwException();
        }
        cr = encoder.flush(dstBuf);
        if (!cr.isUnderflow()) {
            cr.throwException();
        }
        dst.writerIndex(dst.writerIndex() + dstBuf.position() - pos);
        release = false;
        return dst;
    } catch (CharacterCodingException x) {
        throw new IllegalStateException(x);
    } finally {
        if (release) {
            dst.release();
        }
    }
}
 
Example 10
Source File: LogHandlerAccessor.java    From trufflesqueak with MIT License 5 votes vote down vote up
@Override
public void publish(final LogRecord record) {
    final String message = record.getMessage();
    if (message == null) {
        return;
    }
    final CharsetEncoder encoder = ThreadLocalCoders.encoderFor(StandardCharsets.UTF_8);
    if (buffer.position() + 1 + message.length() * encoder.maxBytesPerChar() >= GIG) {
        close();
        initializeMappedBuffer();
    }
    encoder.encode(CharBuffer.wrap(message), buffer, true);
    encoder.flush(buffer);
    buffer.put((byte) 10);
}
 
Example 11
Source File: DataBuffer.java    From spring-analysis-note with MIT License 5 votes vote down vote up
/**
 * Write the given {@code CharSequence} using the given {@code Charset},
 * starting at the current writing position.
 * @param charSequence the char sequence to write into this buffer
 * @param charset the charset to encode the char sequence with
 * @return this buffer
 * @since 5.1.4
 */
default DataBuffer write(CharSequence charSequence, Charset charset) {
	Assert.notNull(charSequence, "CharSequence must not be null");
	Assert.notNull(charset, "Charset must not be null");
	if (charSequence.length() != 0) {
		CharsetEncoder charsetEncoder = charset.newEncoder()
				.onMalformedInput(CodingErrorAction.REPLACE)
				.onUnmappableCharacter(CodingErrorAction.REPLACE);
		CharBuffer inBuffer = CharBuffer.wrap(charSequence);
		int estimatedSize = (int) (inBuffer.remaining() * charsetEncoder.averageBytesPerChar());
		ByteBuffer outBuffer = ensureCapacity(estimatedSize)
				.asByteBuffer(writePosition(), writableByteCount());
		while (true) {
			CoderResult cr = (inBuffer.hasRemaining() ?
					charsetEncoder.encode(inBuffer, outBuffer, true) : CoderResult.UNDERFLOW);
			if (cr.isUnderflow()) {
				cr = charsetEncoder.flush(outBuffer);
			}
			if (cr.isUnderflow()) {
				break;
			}
			if (cr.isOverflow()) {
				writePosition(writePosition() + outBuffer.position());
				int maximumSize = (int) (inBuffer.remaining() * charsetEncoder.maxBytesPerChar());
				ensureCapacity(maximumSize);
				outBuffer = asByteBuffer(writePosition(), writableByteCount());
			}
		}
		writePosition(writePosition() + outBuffer.position());
	}
	return this;
}
 
Example 12
Source File: ZipCoder.java    From Java8CN with Apache License 2.0 5 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    // UTF-8 only for now. Other ArrayDeocder only handles
    // CodingErrorAction.REPLACE mode.
    if (isUTF8 && ce instanceof ArrayEncoder) {
        int blen = ((ArrayEncoder)ce).encode(ca, 0, ca.length, ba);
        if (blen == -1)    // malformed
            throw new IllegalArgumentException("MALFORMED");
        return Arrays.copyOf(ba, blen);
    }
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 13
Source File: DataBuffer.java    From java-technology-stack with MIT License 5 votes vote down vote up
/**
 * Write the given {@code CharSequence} using the given {@code Charset},
 * starting at the current writing position.
 * @param charSequence the char sequence to write into this buffer
 * @param charset the charset to encode the char sequence with
 * @return this buffer
 * @since 5.1.4
 */
default DataBuffer write(CharSequence charSequence, Charset charset) {
	Assert.notNull(charSequence, "CharSequence must not be null");
	Assert.notNull(charset, "Charset must not be null");
	if (charSequence.length() != 0) {
		CharsetEncoder charsetEncoder = charset.newEncoder()
				.onMalformedInput(CodingErrorAction.REPLACE)
				.onUnmappableCharacter(CodingErrorAction.REPLACE);
		CharBuffer inBuffer = CharBuffer.wrap(charSequence);
		int estimatedSize = (int) (inBuffer.remaining() * charsetEncoder.averageBytesPerChar());
		ByteBuffer outBuffer = ensureCapacity(estimatedSize)
				.asByteBuffer(writePosition(), writableByteCount());
		while (true) {
			CoderResult cr = (inBuffer.hasRemaining() ?
					charsetEncoder.encode(inBuffer, outBuffer, true) : CoderResult.UNDERFLOW);
			if (cr.isUnderflow()) {
				cr = charsetEncoder.flush(outBuffer);
			}
			if (cr.isUnderflow()) {
				break;
			}
			if (cr.isOverflow()) {
				writePosition(outBuffer.position());
				int maximumSize = (int) (inBuffer.remaining() * charsetEncoder.maxBytesPerChar());
				ensureCapacity(maximumSize);
				outBuffer = asByteBuffer(writePosition(), writableByteCount());
			}
		}
		writePosition(outBuffer.position());
	}
	return this;
}
 
Example 14
Source File: ZipCoder.java    From jdk8u_jdk with GNU General Public License v2.0 5 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    // UTF-8 only for now. Other ArrayDeocder only handles
    // CodingErrorAction.REPLACE mode.
    if (isUTF8 && ce instanceof ArrayEncoder) {
        int blen = ((ArrayEncoder)ce).encode(ca, 0, ca.length, ba);
        if (blen == -1)    // malformed
            throw new IllegalArgumentException("MALFORMED");
        return Arrays.copyOf(ba, blen);
    }
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 15
Source File: ZipCoder.java    From jdk8u-dev-jdk with GNU General Public License v2.0 5 votes vote down vote up
byte[] getBytes(String s) {
    CharsetEncoder ce = encoder().reset();
    char[] ca = s.toCharArray();
    int len = (int)(ca.length * ce.maxBytesPerChar());
    byte[] ba = new byte[len];
    if (len == 0)
        return ba;
    // UTF-8 only for now. Other ArrayDeocder only handles
    // CodingErrorAction.REPLACE mode.
    if (isUTF8 && ce instanceof ArrayEncoder) {
        int blen = ((ArrayEncoder)ce).encode(ca, 0, ca.length, ba);
        if (blen == -1)    // malformed
            throw new IllegalArgumentException("MALFORMED");
        return Arrays.copyOf(ba, blen);
    }
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca);
    CoderResult cr = ce.encode(cb, bb, true);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    cr = ce.flush(bb);
    if (!cr.isUnderflow())
        throw new IllegalArgumentException(cr.toString());
    if (bb.position() == ba.length)  // defensive copy?
        return ba;
    else
        return Arrays.copyOf(ba, bb.position());
}
 
Example 16
Source File: ReversedLinesFileReader.java    From aion-germany with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Creates a ReversedLinesFileReader with the given block size and encoding.
 *
 * @param file
 *            the file to be read
 * @param blockSize
 *            size of the internal buffer (for ideal performance this should
 *            match with the block size of the underlying file system).
 * @param encoding
 *            the encoding of the file
 * @throws IOException  if an I/O error occurs
 * @since 2.3
 */
public ReversedLinesFileReader(final File file, final int blockSize, final Charset encoding) throws IOException {
    this.blockSize = blockSize;
    this.encoding = encoding;

    randomAccessFile = new RandomAccessFile(file, "r");
    totalByteLength = randomAccessFile.length();
    int lastBlockLength = (int) (totalByteLength % blockSize);
    if (lastBlockLength > 0) {
        totalBlockCount = totalByteLength / blockSize + 1;
    } else {
        totalBlockCount = totalByteLength / blockSize;
        if (totalByteLength > 0) {
            lastBlockLength = blockSize;
        }
    }
    currentFilePart = new FilePart(totalBlockCount, lastBlockLength, null);

    // --- check & prepare encoding ---
    Charset charset = Charsets.toCharset(encoding);
    CharsetEncoder charsetEncoder = charset.newEncoder();
    float maxBytesPerChar = charsetEncoder.maxBytesPerChar();
    if(maxBytesPerChar==1f) {
        // all one byte encodings are no problem
        byteDecrement = 1;
    } else if(charset == Charset.forName("UTF-8")) {
        // UTF-8 works fine out of the box, for multibyte sequences a second UTF-8 byte can never be a newline byte
        // http://en.wikipedia.org/wiki/UTF-8
        byteDecrement = 1;
    } else if(charset == Charset.forName("Shift_JIS")) {
        // Same as for UTF-8
        // http://www.herongyang.com/Unicode/JIS-Shift-JIS-Encoding.html
        byteDecrement = 1;
    } else if(charset == Charset.forName("UTF-16BE") || charset == Charset.forName("UTF-16LE")) {
        // UTF-16 new line sequences are not allowed as second tuple of four byte sequences,
        // however byte order has to be specified
        byteDecrement = 2;
    } else if(charset == Charset.forName("UTF-16")) {
        throw new UnsupportedEncodingException(
                "For UTF-16, you need to specify the byte order (use UTF-16BE or UTF-16LE)");
    } else {
        throw new UnsupportedEncodingException(
                "Encoding "+encoding+" is not supported yet (feel free to submit a patch)");
    }
    // NOTE: The new line sequences are matched in the order given, so it is important that \r\n is BEFORE \n
    newLineSequences = new byte[][] { "\r\n".getBytes(encoding), "\n".getBytes(encoding), "\r".getBytes(encoding) };

    avoidNewlineSplitBufferSize = newLineSequences[0].length;
}
 
Example 17
Source File: GridReversedLinesFileReader.java    From ignite with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a ReverseLineReader with the given block size and encoding.
 *
 * @param file
 *            the file to be read
 * @param blockSize
 *            size of the internal buffer (for ideal performance this should
 *            match with the block size of the underlying file system).
 * @param charset
 *            the encoding of the file
 * @throws IOException  if an I/O error occurs
 * @since 2.3
 */
public GridReversedLinesFileReader(final File file, final int blockSize, final Charset charset) throws IOException {
    this.blockSize = blockSize;
    this.encoding = charset;

    randomAccessFile = new RandomAccessFile(file, "r");
    totalByteLength = randomAccessFile.length();
    int lastBlockLength = (int) (totalByteLength % blockSize);
    if (lastBlockLength > 0) {
        totalBlockCount = totalByteLength / blockSize + 1;
    } else {
        totalBlockCount = totalByteLength / blockSize;
        if (totalByteLength > 0) {
            lastBlockLength = blockSize;
        }
    }
    currentFilePart = new FilePart(totalBlockCount, lastBlockLength, null);

    // --- check & prepare encoding ---
    CharsetEncoder charsetEncoder = charset.newEncoder();
    float maxBytesPerChar = charsetEncoder.maxBytesPerChar();
    if (maxBytesPerChar == 1f) {
        // all one byte encodings are no problem
        byteDecrement = 1;
    } else if (charset == Charset.forName("UTF-8")) {
        // UTF-8 works fine out of the box, for multibyte sequences a second UTF-8 byte can never be a newline byte
        // http://en.wikipedia.org/wiki/UTF-8
        byteDecrement = 1;
    } else if (charset == Charset.forName("Shift_JIS")) {
        // Same as for UTF-8
        // http://www.herongyang.com/Unicode/JIS-Shift-JIS-Encoding.html
        byteDecrement = 1;
    } else if (charset == Charset.forName("UTF-16BE") || charset == Charset.forName("UTF-16LE")) {
        // UTF-16 new line sequences are not allowed as second tuple of four byte sequences,
        // however byte order has to be specified
        byteDecrement = 2;
    } else if (charset == Charset.forName("UTF-16")) {
        throw new UnsupportedEncodingException(
            "For UTF-16, you need to specify the byte order (use UTF-16BE or UTF-16LE)");
    } else {
        throw new UnsupportedEncodingException(
            "Encoding " + charset + " is not supported yet (feel free to submit a patch)");
    }
    // NOTE: The new line sequences are matched in the order given, so it is important that \r\n is BEFORE \n
    newLineSequences = new byte[][] {"\r\n".getBytes(charset), "\n".getBytes(charset), "\r".getBytes(charset)};

    avoidNewlineSplitBufferSize = newLineSequences[0].length;
}
 
Example 18
Source File: IsValidUtf8TestUtil.java    From travelguide with Apache License 2.0 4 votes vote down vote up
/**
 * Variation of {@link #testBytes} that does less allocation using the
 * low-level encoders/decoders directly. Checked in because it's useful for
 * debugging when trying to process bytes faster, but since it doesn't use the
 * actual String class, it's possible for incompatibilities to develop
 * (although unlikely).
 *
 * @param numBytes the number of bytes in the byte array
 * @param expectedCount the expected number of roundtrippable permutations
 * @param start the starting bytes encoded as a long as big-endian
 * @param lim the limit of bytes to process encoded as a long as big-endian,
 *     or -1 to mean the max limit for numBytes
 */
void testBytesUsingByteBuffers(
    int numBytes, long expectedCount, long start, long lim)
    throws UnsupportedEncodingException {
  CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
      .onMalformedInput(CodingErrorAction.REPLACE)
      .onUnmappableCharacter(CodingErrorAction.REPLACE);
  CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder()
      .onMalformedInput(CodingErrorAction.REPLACE)
      .onUnmappableCharacter(CodingErrorAction.REPLACE);
  byte[] bytes = new byte[numBytes];
  int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
  char[] charsDecoded =
      new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1];
  int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
  byte[] bytesReencoded = new byte[maxBytes];

  ByteBuffer bb = ByteBuffer.wrap(bytes);
  CharBuffer cb = CharBuffer.wrap(charsDecoded);
  ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
  if (lim == -1) {
    lim = 1L << (numBytes * 8);
  }
  long count = 0;
  long countRoundTripped = 0;
  for (long byteChar = start; byteChar < lim; byteChar++) {
    bb.rewind();
    bb.limit(bytes.length);
    cb.rewind();
    cb.limit(charsDecoded.length);
    bbReencoded.rewind();
    bbReencoded.limit(bytesReencoded.length);
    encoder.reset();
    decoder.reset();
    long tmpByteChar = byteChar;
    for (int i = 0; i < bytes.length; i++) {
      bytes[bytes.length - i - 1] = (byte) tmpByteChar;
      tmpByteChar = tmpByteChar >> 8;
    }
    boolean isRoundTrippable = ByteString.copyFrom(bytes).isValidUtf8();
    CoderResult result = decoder.decode(bb, cb, true);
    assertFalse(result.isError());
    result = decoder.flush(cb);
    assertFalse(result.isError());

    int charLen = cb.position();
    cb.rewind();
    cb.limit(charLen);
    result = encoder.encode(cb, bbReencoded, true);
    assertFalse(result.isError());
    result = encoder.flush(bbReencoded);
    assertFalse(result.isError());

    boolean bytesEqual = true;
    int bytesLen = bbReencoded.position();
    if (bytesLen != numBytes) {
      bytesEqual = false;
    } else {
      for (int i = 0; i < numBytes; i++) {
        if (bytes[i] != bytesReencoded[i]) {
          bytesEqual = false;
          break;
        }
      }
    }
    if (bytesEqual != isRoundTrippable) {
      outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
    }

    count++;
    if (isRoundTrippable) {
      countRoundTripped++;
    }
    if (byteChar != 0 && byteChar % 1000000 == 0) {
      logger.info("Processed " + (byteChar / 1000000) +
          " million characters");
    }
  }
  logger.info("Round tripped " + countRoundTripped + " of " + count);
  assertEquals(expectedCount, countRoundTripped);
}
 
Example 19
Source File: ReversedLinesFileReader.java    From lams with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Creates a ReversedLinesFileReader with the given block size and encoding.
 *
 * @param file
 *            the file to be read
 * @param blockSize
 *            size of the internal buffer (for ideal performance this should
 *            match with the block size of the underlying file system).
 * @param encoding
 *            the encoding of the file
 * @throws IOException  if an I/O error occurs
 * @since 2.3
 */
@SuppressWarnings("deprecation") // unavoidable until Java 7
public ReversedLinesFileReader(final File file, final int blockSize, final Charset encoding) throws IOException {
    this.blockSize = blockSize;
    this.encoding = encoding;

    // --- check & prepare encoding ---
    final Charset charset = Charsets.toCharset(encoding);
    final CharsetEncoder charsetEncoder = charset.newEncoder();
    final float maxBytesPerChar = charsetEncoder.maxBytesPerChar();
    if (maxBytesPerChar == 1f) {
        // all one byte encodings are no problem
        byteDecrement = 1;
    } else if (charset == Charsets.UTF_8) {
        // UTF-8 works fine out of the box, for multibyte sequences a second UTF-8 byte can never be a newline byte
        // http://en.wikipedia.org/wiki/UTF-8
        byteDecrement = 1;
    } else if(charset == Charset.forName("Shift_JIS") || // Same as for UTF-8
            // http://www.herongyang.com/Unicode/JIS-Shift-JIS-Encoding.html
            charset == Charset.forName("windows-31j") || // Windows code page 932 (Japanese)
            charset == Charset.forName("x-windows-949") || // Windows code page 949 (Korean)
            charset == Charset.forName("gbk") || // Windows code page 936 (Simplified Chinese)
            charset == Charset.forName("x-windows-950")) { // Windows code page 950 (Traditional Chinese)
        byteDecrement = 1;
    } else if (charset == Charsets.UTF_16BE || charset == Charsets.UTF_16LE) {
        // UTF-16 new line sequences are not allowed as second tuple of four byte sequences,
        // however byte order has to be specified
        byteDecrement = 2;
    } else if (charset == Charsets.UTF_16) {
        throw new UnsupportedEncodingException("For UTF-16, you need to specify the byte order (use UTF-16BE or " +
                "UTF-16LE)");
    } else {
        throw new UnsupportedEncodingException("Encoding " + encoding + " is not supported yet (feel free to " +
                "submit a patch)");
    }

    // NOTE: The new line sequences are matched in the order given, so it is important that \r\n is BEFORE \n
    newLineSequences = new byte[][] { "\r\n".getBytes(encoding), "\n".getBytes(encoding), "\r".getBytes(encoding) };

    avoidNewlineSplitBufferSize = newLineSequences[0].length;

    // Open file
    randomAccessFile = new RandomAccessFile(file, "r");
    totalByteLength = randomAccessFile.length();
    int lastBlockLength = (int) (totalByteLength % blockSize);
    if (lastBlockLength > 0) {
        totalBlockCount = totalByteLength / blockSize + 1;
    } else {
        totalBlockCount = totalByteLength / blockSize;
        if (totalByteLength > 0) {
            lastBlockLength = blockSize;
        }
    }
    currentFilePart = new FilePart(totalBlockCount, lastBlockLength, null);

}
 
Example 20
Source File: PerforceShiftJISCharset.java    From p4ic4idea with Apache License 2.0 4 votes vote down vote up
/**
 * Call the superclass constructor with the Charset object and the
 * encodings sizes from the encoder.
 */
Encoder(Charset cs, CharsetEncoder encoder) {
	super(cs, encoder.averageBytesPerChar(), encoder.maxBytesPerChar());
	this.encoder = encoder;
}