org.apache.commons.io.ByteOrderMark Java Examples

The following examples show how to use org.apache.commons.io.ByteOrderMark. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: EncodingSniffer.java    From htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Determines the character encoding announced by a leading
 * <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>, if the
 * given bytes start with one.
 *
 * <p>The UTF-8, UTF-16BE and UTF-16LE marks are checked, in that order.</p>
 *
 * @param bytes the bytes to inspect; may be {@code null}
 * @return the charset matching the detected Byte Order Mark, or {@code null} when
 *         {@code bytes} is {@code null} or does not start with one of the checked marks
 */
static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
    if (bytes == null) {
        return null;
    }

    final Charset detected;
    if (startsWith(bytes, ByteOrderMark.UTF_8)) {
        detected = UTF_8;
    }
    else if (startsWith(bytes, ByteOrderMark.UTF_16BE)) {
        detected = UTF_16BE;
    }
    else if (startsWith(bytes, ByteOrderMark.UTF_16LE)) {
        detected = UTF_16LE;
    }
    else {
        // none of the checked marks is present, so there is nothing to log
        return null;
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Encoding found in Unicode Byte Order Mark: '" + detected + "'.");
    }
    return detected;
}
 
Example #2
Source File: HtmlScript2Test.java    From htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Serves a script that starts with a UTF-8 byte order mark as UTF-8 while the
 * {@code script} tag declares a different charset, and expects the non-ASCII
 * alert text to arrive unchanged.
 *
 * @throws Exception if the test fails
 */
@Test
@Alerts("\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627"
        + "\u064b\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627\u064b")
public void incorrectCharset() throws Exception {
    final String html
        = "<html><head>\n"
        + "  <script src='" + URL_SECOND + "' charset='" + ISO_8859_1 + "'></script>\n"
        + "</head>\n"
        + "<body></body>\n"
        + "</html>";

    // Decode the BOM bytes explicitly as UTF-8. The charset-less String constructor uses the
    // platform default, which on a non-UTF-8 platform turns the three BOM bytes into three
    // unrelated characters, so the served script would no longer start with a real BOM.
    final String bom = new String(ByteOrderMark.UTF_8.getBytes(), UTF_8);
    final String script = bom
            + "alert('" + "\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627"
                        + "\u064b\u0623\u0647\u0644\u0627\u064b\u0623\u0647\u0644\u0627\u064b" + "');";
    getMockWebConnection().setResponse(URL_SECOND, script, MimeType.APPLICATION_JAVASCRIPT, UTF_8);
    loadPageWithAlerts2(html);
}
 
Example #3
Source File: TextInput.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Tests whether the bytes at the current buffer position equal the given byte order mark.
 * The buffer position is left untouched.
 *
 * @param bom the byte order mark to look for
 * @return {@code true} if every byte of {@code bom} matches the buffer content at the
 *         current position; {@code false} on a mismatch or when too few bytes remain
 */
private final boolean checkBom(ByteOrderMark bom) {
  final int bomLength = bom.length();
  if (bufferPtr + bomLength >= length) {
    // the mark would not fit between the current position and the end of the buffer
    return false;
  }
  if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
    buffer.checkBytes(bufferPtr - 1, bufferPtr + bomLength);
  }

  final byte[] expected = bom.getBytes();
  int matched = 0;
  // advance while the buffer keeps agreeing with the mark; stop at the first difference
  while (matched < bomLength
      && PlatformDependent.getByte(bStartMinus1 + bufferPtr + matched) == expected[matched]) {
    matched++;
  }
  return matched == bomLength;
}
 
Example #4
Source File: EncodingSniffer.java    From HtmlUnit-Android with Apache License 2.0 6 votes vote down vote up
/**
 * Looks for a <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte Order Mark</a>
 * at the start of the given byte array and maps it to a charset.
 *
 * @param bytes the bytes to check for a Byte Order Mark; may be {@code null}
 * @return the charset indicated by the mark, or {@code null} if {@code bytes} is
 *         {@code null} or no UTF-8, UTF-16BE or UTF-16LE mark was found
 */
static Charset sniffEncodingFromUnicodeBom(final byte[] bytes) {
    if (bytes == null) {
        return null;
    }

    // the marks are probed in this fixed order: UTF-8, UTF-16BE, UTF-16LE
    final Charset sniffed =
            startsWith(bytes, ByteOrderMark.UTF_8) ? UTF_8
            : startsWith(bytes, ByteOrderMark.UTF_16BE) ? UTF_16BE
            : startsWith(bytes, ByteOrderMark.UTF_16LE) ? UTF_16LE
            : null;

    if (sniffed != null && LOG.isDebugEnabled()) {
        LOG.debug("Encoding found in Unicode Byte Order Mark: '" + sniffed + "'.");
    }
    return sniffed;
}
 
Example #5
Source File: TestNewTextReader.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a small CSV file that starts with a UTF-8 byte order mark and checks that
 * querying it as a text table yields the expected header and row.
 *
 * @throws Exception if the test fails
 */
@Test
public void testBomUtf8() throws Exception {
  // Simple .csv file with a UTF-8 BOM. Should read successfully
  File testFolder = tempDir.newFolder("testUtf8Folder");
  File testFile = new File(testFolder, "utf8.csv");
  // try-with-resources releases the file handle even if writing fails part-way
  try (PrintStream p = new PrintStream(testFile)) {
    p.write(ByteOrderMark.UTF_8.getBytes(), 0, ByteOrderMark.UTF_8.length());
    p.print("A,B\n");
    p.print("5,7\n");
  }

  testBuilder()
    .sqlQuery(String.format("select * from table(dfs.\"%s\" (type => 'text', " +
      "fieldDelimiter => ',', lineDelimiter => '\n', extractHeader => true))",
      testFile.getAbsolutePath()))
    .unOrdered()
    .baselineColumns("A","B")
    .baselineValues("5", "7")
    .go();
}
 
Example #6
Source File: TestNewTextReader.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a small CSV file that starts with a UTF-16LE byte order mark and checks that
 * querying it as a text table fails with the expected data-read error.
 *
 * @throws Exception if the test fails
 */
@Test
public void testErrorBomUtf16() throws Exception {
  // UTF-16 BOM should cause a dataReadError user exception
  File testFolder = tempDir.newFolder("testUtf16Folder");
  File testFile = new File(testFolder, "utf16.csv");
  // try-with-resources releases the file handle even if writing fails part-way
  try (PrintStream p = new PrintStream(testFile)) {
    p.write(ByteOrderMark.UTF_16LE.getBytes(), 0, ByteOrderMark.UTF_16LE.length());
    p.print("A,B\n");
    p.print("5,7\n");
  }

  thrownException.expect(new UserExceptionMatcher(UserBitShared.DremioPBError.ErrorType.DATA_READ,
    "DATA_READ ERROR: UTF-16 files not supported"));
  // NB: using test() instead of testBuilder() because it unwraps the thrown RpcException and re-throws the
  // underlying UserException (which is then matched with the UserExceptionMatcher)
  test(String.format("select * from table(dfs.\"%s\" (type => 'text', " +
      "fieldDelimiter => ',', lineDelimiter => '\n', extractHeader => true))",
    testFile.getAbsolutePath()));
}
 
Example #7
Source File: SubmitAndSyncUtf16FileTypeTest.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the given file through a stream configured to exclude a leading UTF-16LE or
 * UTF-16BE byte order mark, converts the remaining bytes from {@code utf16} to UTF-8
 * and returns the limit of the converted buffer.
 *
 * @param testResourceFile the file to read
 * @param utf16 the charset the file content is converted from
 * @return the limit of the buffer produced by the conversion
 * @throws Exception if reading or converting fails
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream withoutBom = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        ByteBuffer content = ByteBuffer.wrap(IOUtils.toByteArray(withoutBom));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        return toUtf8.convert(content).limit();
    }
}
 
Example #8
Source File: BOMInputStream.java    From lams with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Returns the byte order mark found at the start of the stream, reading and caching the
 * first bytes on the first call.
 *
 * @return the detected BOM, or {@code null} if none of the configured BOMs matched
 * @throws IOException
 *             if reading the first bytes of the stream fails
 */
public ByteOrderMark getBOM() throws IOException {
    if (firstBytes != null) {
        // the stream head was already examined; reuse the cached result
        return byteOrderMark;
    }

    fbLength = 0;
    // BOMs are sorted from longest to shortest, so the first entry gives the read-ahead size
    final int maxBomSize = boms.get(0).length();
    firstBytes = new int[maxBomSize];

    // Buffer up to maxBomSize bytes; an end-of-stream marker is stored and counted too
    int slot = 0;
    while (slot < firstBytes.length) {
        final int value = in.read();
        firstBytes[slot] = value;
        fbLength++;
        if (value < 0) {
            break;
        }
        slot++;
    }

    // match BOM in firstBytes
    byteOrderMark = find();
    if (byteOrderMark != null && !include) {
        if (byteOrderMark.length() < firstBytes.length) {
            // skip only the BOM; the rest of the buffered bytes stay readable
            fbIndex = byteOrderMark.length();
        } else {
            // the BOM fills the whole buffer, so nothing buffered remains
            fbLength = 0;
        }
    }
    return byteOrderMark;
}
 
Example #9
Source File: BOMInputStream.java    From lams with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Searches the configured BOMs, in list order, for the first one accepted by
 * {@code matches}.
 *
 * @return the first matching BOM, or {@code null} if none matched
 */
private ByteOrderMark find() {
    ByteOrderMark found = null;
    for (final ByteOrderMark candidate : boms) {
        if (matches(candidate)) {
            found = candidate;
            break;
        }
    }
    return found;
}
 
Example #10
Source File: BOMInputStream.java    From lams with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Compares the given BOM, byte by byte, with the start of the buffered first bytes.
 *
 * @param bom
 *            the BOM to compare against
 * @return {@code true} if every byte of {@code bom} equals the buffered byte at the same
 *         index, otherwise {@code false}
 */
private boolean matches(final ByteOrderMark bom) {
    // firstBytes may be bigger than the BOM bytes, so only the BOM's own length is compared
    int index = 0;
    while (index < bom.length()) {
        if (bom.get(index) != firstBytes[index]) {
            return false;
        }
        index++;
    }
    return true;
}
 
Example #11
Source File: TextRecordWriter.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Starts writing a new partition: closes the previous one if any, creates the output
 * file, writes a UTF-8 byte order mark and then the delimited column-name header line.
 *
 * @param partition the partition that subsequent output belongs to
 * @throws Exception if closing the previous partition fails, or a data-write
 *         {@code UserException} if the new file cannot be created
 */
@Override
public void startPartition(WritePartition partition) throws Exception {

  if(this.partition != null){
    close();
  }

  this.partition = partition;
  // open a new file for writing data with new schema
  try {
    this.path = fs.canonicalizePath(partition.qualified(location, prefix + "_" + index + "." + extension));
    dos = new DataOutputStream(fs.create(path));
    // The file starts with a UTF-8 BOM, so the text must be encoded as UTF-8 as well.
    // The single-argument PrintStream constructor would use the platform default charset
    // and could produce content that contradicts the BOM.
    stream = new PrintStream(dos, false, "UTF-8");
    stream.write(ByteOrderMark.UTF_8.getBytes(), 0, ByteOrderMark.UTF_8.length());
    logger.debug("Created file: {}", path);
  } catch (IOException e) {
    throw UserException.dataWriteError(e)
      .message("Failure while attempting to write file %s.", path)
      .build(logger);
  }
  index++;

  // header line: column names joined by the field delimiter
  String columns = Joiner.on(fieldDelimiter).join(columnNames);
  stream.print(columns);
  stream.print(lineDelimiter);

}
 
Example #12
Source File: TextInput.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Advances the buffer position past a UTF-8 byte order mark if one is present, and
 * rejects input that starts with a UTF-16 byte order mark.
 *
 * @throws IOException declared by the signature
 */
private final void skipOptionalBOM() throws IOException {
  if (checkBom(ByteOrderMark.UTF_8)) {
    bufferPtr += ByteOrderMark.UTF_8.length();
    return;
  }

  final boolean startsWithUtf16Bom =
      checkBom(ByteOrderMark.UTF_16LE) || checkBom(ByteOrderMark.UTF_16BE);
  if (startsWithUtf16Bom) {
    throw UserException.dataReadError()
      .message("UTF-16 files not supported")
      .build(logger);
  }
}
 
Example #13
Source File: XMLUtils.java    From modernmt with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an XML event reader over the given stream. A UTF-8, UTF-16BE or UTF-16LE byte
 * order mark, when present, selects the charset; otherwise the charset supplied by
 * {@code UTF8Charset.get()} is used.
 *
 * @param stream the XML input
 * @return an event reader over the decoded content
 * @throws XMLStreamException if the byte order mark cannot be read or the reader cannot be created
 */
public static XMLEventReader createEventReader(InputStream stream) throws XMLStreamException {
    final Charset fallback = UTF8Charset.get();

    final BOMInputStream bomStream = new BOMInputStream(stream, false,
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE);

    final Charset charset;
    try {
        charset = bomStream.hasBOM()
                ? Charset.forName(bomStream.getBOMCharsetName())
                : fallback;
    } catch (IOException e) {
        throw new XMLStreamException(e);
    }

    return XMLInputFactory.newInstance()
            .createXMLEventReader(new XMLFixInputStreamReader(bomStream, charset));
}
 
Example #14
Source File: MD5Digester.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Feeds the digest with the UTF-8 form of the given stream: the stream is read with any
 * leading UTF-8/16/32 byte order mark excluded, decoded with {@code charset}, re-encoded
 * chunk by chunk as UTF-8 and, when requested, passed through the line-ending conversion
 * helper before each chunk is handed to {@code update}.
 *
 * @param inStream the stream to digest
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the line ending passed to that helper
 * @throws IOException if reading or encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStrippedStream = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader decodingReader = new InputStreamReader(bomStrippedStream, charset)) {

		// malformed or unmappable characters are reported as errors
		CharsetEncoder utf8Encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chunk = new char[bufferSize];
		for (int count = decodingReader.read(chunk); count > 0; count = decodingReader.read(chunk)) {
			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer utf8Bytes = utf8Encoder.encode(CharBuffer.wrap(chunk, 0, count));

			ByteBuffer toDigest = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decodingReader, utf8Encoder, utf8Bytes, clientLineEnding)
					: utf8Bytes;

			update(toDigest.array(), toDigest.arrayOffset(), toDigest.limit());
		}
	}
}
 
Example #15
Source File: SubmitAndSyncUtf16FileTypeTest.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Determines the size a UTF-16 file has once re-encoded as UTF-8: the file is read
 * through a stream set up to exclude a leading UTF-16LE/BE byte order mark, and the
 * limit of the converted buffer is returned.
 *
 * @param testResourceFile the file to measure
 * @param utf16 the source charset handed to the converter
 * @return the limit of the converted buffer
 * @throws Exception if the file cannot be read or converted
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream source = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        final byte[] payload = IOUtils.toByteArray(source);
        final ByteBuffer wrapped = ByteBuffer.wrap(payload);
        return new CharsetConverter(utf16, CharsetDefs.UTF8).convert(wrapped).limit();
    }
}
 
Example #16
Source File: MD5Digester.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Updates the digest with the content of {@code inStream} converted to UTF-8. The stream
 * is wrapped so that a leading UTF-8/16/32 byte order mark is excluded, decoded with
 * {@code charset} and re-encoded chunk by chunk; each chunk optionally goes through the
 * line-ending conversion helper first.
 *
 * @param inStream the stream to digest
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the line ending passed to that helper
 * @throws IOException if reading or encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomFreeInput = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader sourceReader = new InputStreamReader(bomFreeInput, charset)) {

		CharsetEncoder encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chars = new char[bufferSize];
		while (true) {
			int charsRead = sourceReader.read(chars);
			if (charsRead <= 0) {
				break;
			}

			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer encoded = encoder.encode(CharBuffer.wrap(chars, 0, charsRead));

			ByteBuffer digestInput = encoded;
			if (isRequireLineEndingConvert) {
				digestInput = findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
						sourceReader, encoder, encoded, clientLineEnding);
			}
			update(digestInput.array(), digestInput.arrayOffset(), digestInput.limit());
		}
	}
}
 
Example #17
Source File: WebResponse.java    From htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Decodes the response content into a string using the given charset instead of the one
 * announced by the server. A byte order mark in the content takes precedence over the
 * {@code encoding} argument.
 * @param encoding the charset/encoding to use to convert the response content into a string
 * @param ignoreUtf8Bom if true utf8 bom header will be ignored
 * @return the response content as a string, or null if there is no response data or
 *         reading it failed
 */
public String getContentAsString(final Charset encoding, final boolean ignoreUtf8Bom) {
    if (responseData_ == null) {
        return null;
    }

    try (InputStream in = responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS)) {
        if (!(in instanceof BOMInputStream)) {
            // no BOM-aware stream was provided: decode with the requested charset
            return IOUtils.toString(in, encoding);
        }

        try (BOMInputStream bomIn = (BOMInputStream) in) {
            // there seems to be a bug in BOMInputStream
            // we have to call this before hasBOM(ByteOrderMark)
            final boolean bomPresent = bomIn.hasBOM();
            if (bomPresent) {
                if (!ignoreUtf8Bom && bomIn.hasBOM(ByteOrderMark.UTF_8)) {
                    return IOUtils.toString(bomIn, UTF_8);
                }
                if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
                    return IOUtils.toString(bomIn, UTF_16BE);
                }
                if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
                    return IOUtils.toString(bomIn, UTF_16LE);
                }
            }
            return IOUtils.toString(bomIn, encoding);
        }
    }
    catch (final IOException e) {
        LOG.warn(e.getMessage(), e);
        return null;
    }
}
 
Example #18
Source File: MD5Digester.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Digests the given stream as UTF-8. The input is read with a leading UTF-8/16/32 byte
 * order mark excluded, decoded with {@code charset}, encoded to UTF-8 one buffer at a
 * time and, if requested, run through the line-ending conversion helper before being
 * passed to {@code update}.
 *
 * @param inStream the stream to digest
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the line ending passed to that helper
 * @throws IOException if reading or encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStrippedStream = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader decodingReader = new InputStreamReader(bomStrippedStream, charset)) {

		// malformed or unmappable characters are reported as errors
		CharsetEncoder utf8Encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chunk = new char[bufferSize];
		for (int count = decodingReader.read(chunk); count > 0; count = decodingReader.read(chunk)) {
			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer utf8Bytes = utf8Encoder.encode(CharBuffer.wrap(chunk, 0, count));

			ByteBuffer toDigest = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decodingReader, utf8Encoder, utf8Bytes, clientLineEnding)
					: utf8Bytes;

			update(toDigest.array(), toDigest.arrayOffset(), toDigest.limit());
		}
	}
}
 
Example #19
Source File: SubmitAndSyncUtf16FileTypeTest.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the given file through a stream configured to exclude a leading UTF-16LE or
 * UTF-16BE byte order mark, converts the remaining bytes from {@code utf16} to UTF-8
 * and returns the limit of the converted buffer.
 *
 * @param testResourceFile the file to read
 * @param utf16 the charset the file content is converted from
 * @return the limit of the buffer produced by the conversion
 * @throws Exception if reading or converting fails
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream withoutBom = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        ByteBuffer content = ByteBuffer.wrap(IOUtils.toByteArray(withoutBom));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        return toUtf8.convert(content).limit();
    }
}
 
Example #20
Source File: MD5Digester.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Updates the digest with the content of {@code inStream} converted to UTF-8. The stream
 * is wrapped so that a leading UTF-8/16/32 byte order mark is excluded, decoded with
 * {@code charset} and re-encoded chunk by chunk; each chunk optionally goes through the
 * line-ending conversion helper first.
 *
 * @param inStream the stream to digest
 * @param charset the charset used to decode {@code inStream}
 * @param isRequireLineEndingConvert whether each chunk goes through the line-ending helper
 * @param clientLineEnding the line ending passed to that helper
 * @throws IOException if reading or encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomFreeInput = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader sourceReader = new InputStreamReader(bomFreeInput, charset)) {

		CharsetEncoder encoder = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);

		char[] chars = new char[bufferSize];
		while (true) {
			int charsRead = sourceReader.read(chars);
			if (charsRead <= 0) {
				break;
			}

			// Convert encoded stream to UTF8 since server digest is UTF8
			ByteBuffer encoded = encoder.encode(CharBuffer.wrap(chars, 0, charsRead));

			ByteBuffer digestInput = encoded;
			if (isRequireLineEndingConvert) {
				digestInput = findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
						sourceReader, encoder, encoded, clientLineEnding);
			}
			update(digestInput.array(), digestInput.arrayOffset(), digestInput.limit());
		}
	}
}
 
Example #21
Source File: SubmitAndSyncUtf16FileTypeTest.java    From p4ic4idea with Apache License 2.0 5 votes vote down vote up
/**
 * Determines the size a UTF-16 file has once re-encoded as UTF-8: the file is read
 * through a stream set up to exclude a leading UTF-16LE/BE byte order mark, and the
 * limit of the converted buffer is returned.
 *
 * @param testResourceFile the file to measure
 * @param utf16 the source charset handed to the converter
 * @return the limit of the converted buffer
 * @throws Exception if the file cannot be read or converted
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream source = new BOMInputStream(
            new FileInputStream(testResourceFile),
            false,
            ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE)) {
        final byte[] payload = IOUtils.toByteArray(source);
        final ByteBuffer wrapped = ByteBuffer.wrap(payload);
        return new CharsetConverter(utf16, CharsetDefs.UTF8).convert(wrapped).limit();
    }
}
 
Example #22
Source File: StreamDecoder.java    From batfish with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps the given stream in a {@code BOMInputStream} configured with the UTF-8, UTF-16
 * and UTF-32 byte order marks.
 */
private static @Nonnull BOMInputStream bomInputStream(@Nonnull InputStream inputStream) {
  final ByteOrderMark[] recognizedMarks = {
    ByteOrderMark.UTF_8,
    ByteOrderMark.UTF_16BE,
    ByteOrderMark.UTF_16LE,
    ByteOrderMark.UTF_32BE,
    ByteOrderMark.UTF_32LE
  };
  return new BOMInputStream(inputStream, recognizedMarks);
}
 
Example #23
Source File: ChakraTest.java    From es6draft with MIT License 5 votes vote down vote up
/**
 * Maps the byte order mark reported by the stream to a charset. UTF-8 is the answer both
 * for a UTF-8 mark and for anything that is not a UTF-16LE/BE mark (including no mark).
 *
 * @param bis the stream whose byte order mark is inspected
 * @return the charset matching the mark, or UTF-8 by default
 * @throws IOException if the byte order mark cannot be read
 */
private static Charset charsetFor(BOMInputStream bis) throws IOException {
    final ByteOrderMark detected = bis.getBOM();
    final Charset result;
    if (ByteOrderMark.UTF_8.equals(detected)) {
        result = StandardCharsets.UTF_8;
    } else if (ByteOrderMark.UTF_16LE.equals(detected)) {
        result = StandardCharsets.UTF_16LE;
    } else if (ByteOrderMark.UTF_16BE.equals(detected)) {
        result = StandardCharsets.UTF_16BE;
    } else {
        // absent or unrecognised mark: fall back to UTF-8
        result = StandardCharsets.UTF_8;
    }
    return result;
}
 
Example #24
Source File: StreamUtil.java    From iaf with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Reader over the InputStream whose character set is taken from a UTF-8, UTF-16LE or
 * UTF-16BE byte order mark at the start of the stream. When no such BOM is found, the supplied
 * default character set is used.
 */
public static Reader getCharsetDetectingInputStreamReader(InputStream inputStream, String defaultCharset) throws IOException {
	BOMInputStream bomAwareStream = new BOMInputStream(inputStream,ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE);
	ByteOrderMark detected = bomAwareStream.getBOM();

	String charsetName;
	if (detected != null) {
		charsetName = detected.getCharsetName();
	} else {
		charsetName = defaultCharset;
	}

	BufferedInputStream buffered = new BufferedInputStream(bomAwareStream);
	return new InputStreamReader(buffered, charsetName);
}
 
Example #25
Source File: CharsetIdentification.java    From storm-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Detects a UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) byte order mark at the start of the
 * given bytes and returns the name of the corresponding charset.
 *
 * @param byteData the raw content to inspect; may be {@code null}
 * @return the charset name indicated by the BOM, or {@code null} if {@code byteData} is
 *         {@code null}, no BOM is present, or the bytes cannot be read
 */
private static String getCharsetFromBOM(final byte[] byteData) {
    if (byteData == null) {
        // nothing to inspect; treat like "no BOM" instead of failing in ByteArrayInputStream
        return null;
    }
    // The single-argument BOMInputStream constructor only looks for the UTF-8 BOM, so the
    // marks to detect are listed explicitly to really cover "any" BOM.
    try (BOMInputStream bomIn = new BOMInputStream(
            new ByteArrayInputStream(byteData),
            ByteOrderMark.UTF_8,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
            ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE)) {
        ByteOrderMark bom = bomIn.getBOM();
        if (bom != null) {
            return bom.getCharsetName();
        }
    } catch (IOException e) {
        // detection is best effort: an unreadable head simply means "no charset from BOM"
        return null;
    }
    return null;
}
 
Example #26
Source File: BOMInputStream.java    From aion-germany with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Orders byte order marks by descending length.
 *
 * @return -1 if {@code bom1} is longer than {@code bom2}, 1 if it is shorter,
 *         0 if both have the same length
 */
public int compare(ByteOrderMark bom1, ByteOrderMark bom2) {
    final int firstLength = bom1.length();
    final int secondLength = bom2.length();
    if (firstLength == secondLength) {
        return 0;
    }
    // the longer mark sorts first
    return firstLength > secondLength ? -1 : 1;
}
 
Example #27
Source File: CsvInput.java    From hop with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the first line of the given file and derives the field names from it.
 *
 * The file is opened through a stream configured with the UTF-8 and UTF-16 byte order marks.
 * The line is split using the delimiter, enclosure and escape character from the metadata;
 * enclosures are then removed (when one is configured) and the names trimmed.
 *
 * @param fileName the file to read the header line from
 * @param csvInputMeta the metadata supplying delimiter, enclosure, encoding and escape character
 * @return the field names found on the first line
 * @throws HopException declared by the signature; an I/O failure is reported as a
 *         {@code HopFileException}
 */
String[] readFieldNamesFromFile( String fileName, CsvInputMeta csvInputMeta ) throws HopException {
  String delimiter = environmentSubstitute( csvInputMeta.getDelimiter() );
  String enclosure = environmentSubstitute( csvInputMeta.getEnclosure() );
  String realEncoding = environmentSubstitute( csvInputMeta.getEncoding() );

  try ( FileObject fileObject = HopVfs.getFileObject( fileName );
        BOMInputStream inputStream =
          new BOMInputStream( HopVfs.getInputStream( fileObject ), ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE ) ) {
    // without a configured encoding the reader uses the platform default charset
    InputStreamReader reader = Utils.isEmpty( realEncoding )
      ? new InputStreamReader( inputStream )
      : new InputStreamReader( inputStream, realEncoding );

    EncodingType encodingType = EncodingType.guessEncodingType( reader.getEncoding() );
    StringBuilder lineBuffer = new StringBuilder( 1000 );
    String line =
      TextFileInput.getLine( log, reader, encodingType, TextFileInputMeta.FILE_FORMAT_UNIX, lineBuffer );

    String[] fieldNames =
      TextFileLineUtil.guessStringsFromLine( log, line, delimiter, enclosure, csvInputMeta.getEscapeCharacter() );
    boolean hasEnclosure = !Utils.isEmpty( csvInputMeta.getEnclosure() );
    if ( hasEnclosure ) {
      removeEnclosure( fieldNames, csvInputMeta.getEnclosure() );
    }
    trimFieldNames( fieldNames );
    return fieldNames;
  } catch ( IOException e ) {
    throw new HopFileException( BaseMessages.getString( PKG, "CsvInput.Exception.CreateFieldMappingError" ), e );
  }
}
 
Example #28
Source File: WebResponse.java    From HtmlUnit-Android with Apache License 2.0 5 votes vote down vote up
/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Returns the response content as a string, using the specified charset,
 * rather than the charset/encoding specified in the server response.
 * If there is a bom header the charset parameter will be overwritten by the bom.
 * @param encoding the charset/encoding to use to convert the response content into a string
 * @param ignoreUtf8Bom if true utf8 bom header will be ignored
 * @return the response content as a string, or null if there is no response data, no
 *         input stream, or the content retrieval failed
 */
public String getContentAsString(final Charset encoding, final boolean ignoreUtf8Bom) {
    if (responseData_ != null) {
        try (InputStream in = responseData_.getInputStream()) {
            if (in != null) {
                try (BOMInputStream bomIn = new BOMInputStream(in, BOM_HEADERS)) {
                    // there seems to be a bug in BOMInputStream
                    // we have to call this before hasBOM(ByteOrderMark)
                    if (bomIn.hasBOM()) {
                        if (!ignoreUtf8Bom && bomIn.hasBOM(ByteOrderMark.UTF_8)) {
                            return IOUtils.toString(bomIn, UTF_8);
                        }
                        if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
                            return IOUtils.toString(bomIn, UTF_16BE);
                        }
                        if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
                            return IOUtils.toString(bomIn, UTF_16LE);
                        }
                    }
                    // no usable BOM: decode with the charset requested by the caller
                    return IOUtils.toString(bomIn, encoding);
                }
            }
        }
        catch (final IOException e) {
            // pass the exception as the throwable so the stack trace is logged,
            // not just the exception's string form
            LOG.warn(e.getMessage(), e);
        }
    }
    return null;
}
 
Example #29
Source File: BOMInputStream.java    From aion-germany with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
 * 
 * @param delegate
 *            the InputStream to delegate to
 * @param include
 *            true to include the specified BOMs or false to exclude them
 * @param boms
 *            The BOMs to detect and optionally exclude
 */
public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
    super(delegate);
    if (boms == null || boms.length == 0) {
        throw new IllegalArgumentException("No BOMs specified");
    }
    this.include = include;
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    Arrays.sort(boms, ByteOrderMarkLengthComparator);
    this.boms = Arrays.asList(boms);

}
 
Example #30
Source File: BOMInputStream.java    From aion-germany with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Detects and returns the byte order mark at the start of the stream. The first call
 * buffers the leading bytes and caches the result; later calls return the cached value.
 *
 * @return the BOM that matched, or {@code null} if none did
 * @throws IOException
 *             if the first bytes of the stream cannot be read
 */
public ByteOrderMark getBOM() throws IOException {
    final boolean alreadyProbed = firstBytes != null;
    if (!alreadyProbed) {
        fbLength = 0;
        // BOMs are sorted from longest to shortest
        firstBytes = new int[boms.get(0).length()];

        // Fill the look-ahead buffer; stop right after storing an end-of-stream value
        int position = 0;
        boolean endReached = false;
        while (!endReached && position < firstBytes.length) {
            firstBytes[position] = in.read();
            fbLength++;
            endReached = firstBytes[position] < 0;
            position++;
        }

        // match BOM in firstBytes
        byteOrderMark = find();
        final boolean dropBom = byteOrderMark != null && !include;
        if (dropBom) {
            if (byteOrderMark.length() < firstBytes.length) {
                // start reading buffered bytes right after the BOM
                fbIndex = byteOrderMark.length();
            } else {
                // the buffer holds nothing but the BOM
                fbLength = 0;
            }
        }
    }
    return byteOrderMark;
}