Java Code Examples for java.nio.charset.CharsetDecoder#onMalformedInput()

The following examples show how to use java.nio.charset.CharsetDecoder#onMalformedInput(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Text.java    From hadoop-gpu with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * <p>When {@code replace} is set, the decoder is temporarily switched to
 * {@link CodingErrorAction#REPLACE} for both malformed input and unmappable
 * characters, and is always switched back to {@link CodingErrorAction#REPORT}
 * afterwards, even if decoding throws. The decoder appears to be reused across
 * calls, so leaving it in REPLACE mode would silently change later strict decodes.
 *
 * @param utf8 the bytes to decode
 * @param replace whether to substitute bad input instead of reporting it
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace)
  throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (!replace) {
    return decoder.decode(utf8).toString();
  }
  decoder.onMalformedInput(CodingErrorAction.REPLACE);
  decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // set decoder back to its default value: REPORT, even on failure
    decoder.onMalformedInput(CodingErrorAction.REPORT);
    decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  }
}
 
Example 2
Source File: UnicodeHelper.java    From p4ic4idea with Apache License 2.0 6 votes vote down vote up
/**
 * Heuristically checks whether the first {@code bytesRead} bytes of
 * {@code bytes} are plausibly encoded in {@code clientCharset}.
 *
 * The bytes are run through a strict decoder (malformed and unmappable input
 * are reported) as non-final input; the answer is {@code false} only when the
 * decoder reports an error. A {@code null} charset always yields {@code true}.
 *
 * The heuristics are known to be inefficient and are to be revisited once
 * likely usage patterns are better understood. This has been heavily changed
 * since inception and will almost certainly disappear in the 10.x
 * timeframe -- HR.
 */
public static boolean inferCharset(byte[] bytes, int bytesRead, Charset clientCharset) {
	ByteBuffer input = ByteBuffer.wrap(bytes, 0, bytesRead);
	CharBuffer output = CharBuffer.allocate(input.capacity() * 2);

	if (clientCharset == null) {
		// Nothing to test against.
		return true;
	}

	CoderResult outcome = clientCharset.newDecoder()
			.onMalformedInput(CodingErrorAction.REPORT)
			.onUnmappableCharacter(CodingErrorAction.REPORT)
			.decode(input, output, false);

	// No error means "probably this charset"; it is still only a guess.
	return outcome == null || !outcome.isError();
}
 
Example 3
Source File: CharsetUtil.java    From android-netty with Apache License 2.0 6 votes vote down vote up
/**
 * Returns a cached thread-local {@link CharsetDecoder} for the specified
 * {@code charset}, creating and caching one on first use.
 *
 * The returned decoder is always configured to replace malformed input and
 * unmappable characters; a decoder taken from the cache is reset first.
 *
 * @throws NullPointerException if {@code charset} is {@code null}
 */
public static CharsetDecoder getDecoder(Charset charset) {
    if (charset == null) {
        throw new NullPointerException("charset");
    }

    Map<Charset, CharsetDecoder> cache = decoders.get();
    CharsetDecoder decoder = cache.get(charset);
    if (decoder == null) {
        decoder = charset.newDecoder();
        cache.put(charset, decoder);
    } else {
        // Clear any state left over from the previous user.
        decoder.reset();
    }

    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    return decoder;
}
 
Example 4
Source File: Text.java    From hadoop with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * <p>With {@code replace} set, malformed input and unmappable characters are
 * replaced rather than reported for the duration of this call. The decoder is
 * put back into {@link CodingErrorAction#REPORT} mode in a {@code finally}
 * block so that a failure during decoding cannot leave a (presumably shared)
 * decoder in lenient mode.
 *
 * @param utf8 the bytes to decode
 * @param replace whether to substitute bad input instead of reporting it
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace)
  throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (!replace) {
    return decoder.decode(utf8).toString();
  }
  decoder.onMalformedInput(CodingErrorAction.REPLACE);
  decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // set decoder back to its default value: REPORT, even on failure
    decoder.onMalformedInput(CodingErrorAction.REPORT);
    decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  }
}
 
Example 5
Source File: TracingManagedHttpClientConnectionFactory.java    From caravan with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a tracing connection for the given route.
 *
 * Falls back to {@code ConnectionConfig.DEFAULT} when {@code config} is
 * {@code null}. A decoder/encoder pair is built only when the configuration
 * names a charset; both use the configured coding-error actions, defaulting
 * to {@link CodingErrorAction#REPORT} when an action is not configured.
 */
@Override
public ManagedHttpClientConnection create(final HttpRoute route, final ConnectionConfig config) {
    final ConnectionConfig effective = (config == null) ? ConnectionConfig.DEFAULT : config;
    final Charset charset = effective.getCharset();

    CodingErrorAction onMalformed = effective.getMalformedInputAction();
    if (onMalformed == null) {
        onMalformed = CodingErrorAction.REPORT;
    }
    CodingErrorAction onUnmappable = effective.getUnmappableInputAction();
    if (onUnmappable == null) {
        onUnmappable = CodingErrorAction.REPORT;
    }

    CharsetDecoder decoder = null;
    CharsetEncoder encoder = null;
    if (charset != null) {
        decoder = charset.newDecoder()
                .onMalformedInput(onMalformed)
                .onUnmappableCharacter(onUnmappable);
        encoder = charset.newEncoder()
                .onMalformedInput(onMalformed)
                .onUnmappableCharacter(onUnmappable);
    }

    final String connectionId = "http-outgoing-" + COUNTER.getAndIncrement();
    return new TracingManagedHttpClientConnection(
            connectionId,
            effective.getBufferSize(),
            effective.getFragmentSizeHint(),
            decoder,
            encoder,
            effective.getMessageConstraints(),
            incomingContentStrategy,
            outgoingContentStrategy,
            requestWriterFactory,
            responseParserFactory,
            logFunc);
}
 
Example 6
Source File: NetStringUtil.java    From cronet with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Converts text in a given character set to a Unicode string. Any invalid
 * or unmappable input is replaced with U+FFFD.
 *
 * @param text ByteBuffer containing the encoded bytes to convert.
 * @param charsetName Name of the character set the text is encoded in.
 * @return Unicode string on success; null if the character set is not
 *         recognized or any other exception occurs while decoding.
 */
@CalledByNative
private static String convertToUnicodeWithSubstitutions(
        ByteBuffer text,
        String charsetName) {
    try {
        // TODO(mmenke):  Investigate if Charset.decode() can be used
        // instead.  The question is whether it uses the proper replace
        // character.  JDK CharsetDecoder docs say U+FFFD is the default,
        // but Charset.decode() docs say it uses the "charset's default
        // replacement byte array".
        CharsetDecoder lenientDecoder = Charset.forName(charsetName)
                .newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE)
                .replaceWith("\uFFFD");
        return lenientDecoder.decode(text).toString();
    } catch (Exception e) {
        return null;
    }
}
 
Example 7
Source File: Text.java    From RDFS with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * <p>If {@code replace} is {@code true}, the decoder substitutes malformed and
 * unmappable input for this call only. Restoring the REPORT actions happens in
 * a {@code finally} block: the decoder looks to be reused between calls, so it
 * must not stay lenient when decoding fails unexpectedly.
 *
 * @param utf8 the bytes to decode
 * @param replace whether to substitute bad input instead of reporting it
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace)
  throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (!replace) {
    return decoder.decode(utf8).toString();
  }
  decoder.onMalformedInput(CodingErrorAction.REPLACE);
  decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // set decoder back to its default value: REPORT, even on failure
    decoder.onMalformedInput(CodingErrorAction.REPORT);
    decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  }
}
 
Example 8
Source File: UnicodeHelper.java    From p4ic4idea with Apache License 2.0 6 votes vote down vote up
/**
 * Guesses whether the first {@code bytesRead} bytes of {@code bytes} could be
 * text in {@code clientCharset}.
 *
 * A strict decoder for the charset (reporting malformed and unmappable input)
 * is fed the bytes as non-final input. Only a decoder error produces
 * {@code false}; every other outcome, and a {@code null} charset, produces
 * {@code true}.
 *
 * These heuristics are inefficient and will be revisited when likely usage
 * patterns are better known. Note this has been heavily changed since
 * inception and will almost certainly disappear in the 10.x timeframe -- HR.
 */
public static boolean inferCharset(byte[] bytes, int bytesRead, Charset clientCharset) {
	ByteBuffer source = ByteBuffer.wrap(bytes, 0, bytesRead);
	CharBuffer sink = CharBuffer.allocate(source.capacity() * 2);

	if (clientCharset == null) {
		return true;
	}

	CharsetDecoder strictDecoder = clientCharset.newDecoder();
	strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
	strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);

	CoderResult verdict = strictDecoder.decode(source, sink, false);
	if (verdict != null && verdict.isError()) {
		// Wasn't this one...
		return false;
	}
	// Still only *probably* the right charset.
	return true;
}
 
Example 9
Source File: Text.java    From Canova with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * <p>When {@code replace} is set, malformed input and unmappable characters
 * are replaced instead of reported. The decoder is returned to
 * {@link CodingErrorAction#REPORT} in a {@code finally} block so that an
 * unexpected failure cannot leave a (presumably reused) decoder lenient.
 *
 * @param utf8 the bytes to decode
 * @param replace whether to substitute bad input instead of reporting it
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace)
        throws CharacterCodingException {
    CharsetDecoder decoder = DECODER_FACTORY.get();
    if (!replace) {
        return decoder.decode(utf8).toString();
    }
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    try {
        return decoder.decode(utf8).toString();
    } finally {
        // set decoder back to its default value: REPORT, even on failure
        decoder.onMalformedInput(CodingErrorAction.REPORT);
        decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
    }
}
 
Example 10
Source File: StringUtils.java    From Pydev with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Decodes {@code b} to a string without ever throwing.
 *
 * <p>A {@code null} {@code baseCharset} means ISO-8859-1. If the direct
 * conversion throws, a second attempt is made with a decoder for the same
 * charset name that ignores malformed and unmappable input. If that fails too,
 * the failure is logged and a fixed error message is returned.
 *
 * @param b the bytes to decode
 * @param baseCharset the charset name, or {@code null} for ISO-8859-1
 * @return the decoded text, or a fixed error message if decoding was impossible
 */
public static String safeDecodeByteArray(byte[] b, String baseCharset) {
    try {
        if (baseCharset == null) {
            return new String(b, StandardCharsets.ISO_8859_1);
        }
        return new String(b, baseCharset);
    } catch (Exception e) {
        try {
            //If it fails, go for something which shouldn't fail!
            CharsetDecoder decoder = Charset.forName(baseCharset).newDecoder();
            decoder.onMalformedInput(CodingErrorAction.IGNORE);
            decoder.onUnmappableCharacter(CodingErrorAction.IGNORE);
            return decoder.decode(ByteBuffer.wrap(b)).toString();
        } catch (Exception e2) {
            Log.log(e2);
            //Shouldn't ever happen!
            return "Unable to decode bytearray from Python.";
        }
    }
}
 
Example 11
Source File: Text.java    From Bats with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes the remaining bytes of {@code utf8} into a string using the decoder
 * supplied by {@code DECODER_FACTORY}.
 *
 * <p>With {@code replace} set, the decoder replaces malformed and unmappable
 * input during this call. It is switched back to
 * {@link CodingErrorAction#REPORT} in a {@code finally} block, so a decoder
 * that is (apparently) reused across calls never stays lenient after a failure.
 *
 * @param utf8 the bytes to decode
 * @param replace whether to substitute bad input instead of reporting it
 * @return the decoded string
 * @throws CharacterCodingException if the decoder reports a coding error
 */
private static String decode(ByteBuffer utf8, boolean replace)
    throws CharacterCodingException {
  CharsetDecoder decoder = DECODER_FACTORY.get();
  if (!replace) {
    return decoder.decode(utf8).toString();
  }
  decoder.onMalformedInput(CodingErrorAction.REPLACE);
  decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
  try {
    return decoder.decode(utf8).toString();
  } finally {
    // set decoder back to its default value: REPORT, even on failure
    decoder.onMalformedInput(CodingErrorAction.REPORT);
    decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  }
}
 
Example 12
Source File: LuceneUtil.java    From localization_nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Shortens a field value so that its encoded form fits within Lucene's byte
 * size limit on indexed terms.
 *
 * The value is encoded with the platform default charset; if it is too long,
 * the first {@code IndexWriter.MAX_TERM_LENGTH} bytes are decoded back with
 * malformed input ignored (so a character cut in half by the chop is dropped).
 *
 * @param field the string to be indexed
 * @return {@code field} itself when it is {@code null} or already short enough,
 *         the truncated string otherwise, or null if decoding fails
 */
public static String truncateIndexField(String field) {
    if (field == null) {
        return null;
    }

    final Charset platformCharset = Charset.defaultCharset();
    final byte[] encoded = field.getBytes(platformCharset);
    final int maxBytes = IndexWriter.MAX_TERM_LENGTH;
    if (encoded.length <= maxBytes) {
        return field;
    }

    // Keep only the leading bytes that fit.
    final ByteBuffer head = ByteBuffer.wrap(encoded, 0, maxBytes);

    try {
        // Turn the chopped bytes back into text using the same charset.
        final CharsetDecoder lenient = platformCharset.newDecoder();
        lenient.onMalformedInput(CodingErrorAction.IGNORE);
        lenient.reset();
        return lenient.decode(head).toString();
    } catch (CharacterCodingException shouldNotHappen) {
        // Something bad has happened; signal it with null.
        return null;
    }
}
 
Example 13
Source File: AbstractMatcher.java    From netbeans with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new decoder for {@code charset}, configured from the
 * {@code strict} flag.
 *
 * In strict mode both malformed input and unmappable characters are
 * reported; otherwise malformed input is ignored and unmappable characters
 * are replaced.
 */
public CharsetDecoder prepareDecoder(Charset charset) {
    final CharsetDecoder fresh = charset.newDecoder();
    final CodingErrorAction onMalformed =
            strict ? CodingErrorAction.REPORT : CodingErrorAction.IGNORE;
    final CodingErrorAction onUnmappable =
            strict ? CodingErrorAction.REPORT : CodingErrorAction.REPLACE;
    return fresh.onMalformedInput(onMalformed).onUnmappableCharacter(onUnmappable);
}
 
Example 14
Source File: Message.java    From SI with BSD 2-Clause "Simplified" License 5 votes vote down vote up
/**
 * Builds a short, human-readable rendering of the payload for tracing.
 *
 * Returns "no payload" for a null or empty payload. If every byte is at least
 * a space, or is a tab, line feed or carriage return, up to 24 decoded
 * characters are shown in quotes (followed by the total byte count when the
 * text did not fit). Anything else is rendered via {@code Utils.toHexText}.
 * Bytes are compared as signed values, so bytes of 0x80 and above count as
 * non-text.
 */
public String getPayloadTracingString() {
	if (payload == null || payload.length == 0) {
		return "no payload";
	}

	boolean printable = true;
	for (byte value : payload) {
		boolean allowedWhitespace = value == '\t' || value == '\n' || value == '\r';
		if (value < ' ' && !allowedWhitespace) {
			printable = false;
			break;
		}
	}

	if (printable) {
		CharsetDecoder strictDecoder = CoAP.UTF8_CHARSET.newDecoder();
		strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
		strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		ByteBuffer source = ByteBuffer.wrap(payload);
		// The preview is capped at 24 characters.
		CharBuffer preview = CharBuffer.allocate(24);
		CoderResult outcome = strictDecoder.decode(source, preview, true);
		strictDecoder.flush(preview);
		preview.flip();
		if (outcome == CoderResult.OVERFLOW) {
			return "\"" + preview + "\".. " + payload.length + " bytes";
		}
		if (!outcome.isError()) {
			return "\"" + preview + "\"";
		}
	}
	return Utils.toHexText(payload, 256);
}
 
Example 15
Source File: JsonReader.java    From jsondb-core with MIT License 5 votes vote down vote up
/**
 * Opens {@code collectionFile} for reading under an exclusive file lock.
 *
 * <p>A lock file named {@code <collection file name>.lock} is created (if
 * needed) in a {@code lock} directory next to the collection file and locked
 * through a {@link RandomAccessFile} channel. The collection file is then
 * opened with a decoder for the configured charset that reports malformed
 * and unmappable input instead of replacing it.
 *
 * <p>If the collection file cannot be opened after the lock was obtained, the
 * lock channel and file are closed before the exception propagates, so a
 * failed construction does not leave the lock held.
 *
 * @param dbConfig supplies the charset used to decode the collection file
 * @param collectionFile the file to read
 * @throws IOException if the lock file or the collection file cannot be opened
 * @throws JsonFileLockException if the file lock cannot be obtained
 */
public JsonReader(JsonDBConfig dbConfig, File collectionFile) throws IOException {
  this.collectionFile = collectionFile;
  this.lockFilesLocation = new File(collectionFile.getParentFile(), "lock");
  this.fileLockLocation = new File(lockFilesLocation, collectionFile.getName() + ".lock");
  
  if(!lockFilesLocation.exists()) {
    lockFilesLocation.mkdirs();
  }
  if(!fileLockLocation.exists()) {
    fileLockLocation.createNewFile();
  }

  CharsetDecoder decoder = dbConfig.getCharset().newDecoder();
  decoder.onMalformedInput(CodingErrorAction.REPORT);
  decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  
  raf = new RandomAccessFile(fileLockLocation, "rw");
  channel = raf.getChannel();
  try {
    lock = channel.lock();
  } catch (IOException | OverlappingFileLockException e) {
    try {
      channel.close();
      raf.close();
    } catch (IOException e1) {
      logger.error("Failed while closing RandomAccessFile for collection file {}", collectionFile.getName());
    }
    throw new JsonFileLockException("JsonReader failed to obtain a file lock for file " + fileLockLocation, e);
  }

  try {
    fis = new FileInputStream(collectionFile);
    isr = new InputStreamReader(fis, decoder);
    reader = new BufferedReader(isr);
  } catch (IOException | RuntimeException e) {
    // The caller never receives this instance, so nobody else can release the
    // lock: close the channel (which drops the lock) and the file here.
    try {
      channel.close();
      raf.close();
    } catch (IOException closeFailure) {
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
}
 
Example 16
Source File: InputStreamReaderTest.java    From j2objc with Apache License 2.0 5 votes vote down vote up
/**
 * Checks how InputStreamReader.read() treats a truncated UTF-8 sequence
 * (bytes -32, -96) depending on where the decoder comes from and which
 * malformed-input action it carries.
 */
public void test_read_1() throws IOException {
    final byte[] truncatedUtf8 = { -32, -96 };
    final String wrongValue = "read() return incorrect value";
    final int replacementChar = 65533;

    // A decoder built by InputStreamReader itself defaults to REPLACE,
    // whether the charset is given by name ...
    InputStreamReader byName = new InputStreamReader(
            new ByteArrayInputStream(truncatedUtf8), "UTF-8");
    assertEquals(wrongValue, replacementChar, byName.read());

    // ... or as a Charset instance.
    InputStreamReader byCharset = new InputStreamReader(
            new ByteArrayInputStream(truncatedUtf8), Charset.forName("UTF-8"));
    assertEquals(wrongValue, replacementChar, byCharset.read());

    // A decoder handed in by the caller keeps its own settings: REPORT throws.
    CharsetDecoder reporting = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT);
    InputStreamReader strictReader = new InputStreamReader(
            new ByteArrayInputStream(truncatedUtf8), reporting);
    try {
        strictReader.read();
        fail("Should throw MalformedInputException");
    } catch (MalformedInputException e) {
        // expected
    }

    // IGNORE drops the bad bytes, leaving nothing to read.
    CharsetDecoder ignoring = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.IGNORE);
    InputStreamReader skippingReader = new InputStreamReader(
            new ByteArrayInputStream(truncatedUtf8), ignoring);
    assertEquals(wrongValue, -1, skippingReader.read());

    // REPLACE yields the replacement character.
    CharsetDecoder replacing = Charset.forName("UTF-8").newDecoder()
            .onMalformedInput(CodingErrorAction.REPLACE);
    InputStreamReader replacingReader = new InputStreamReader(
            new ByteArrayInputStream(truncatedUtf8), replacing);
    assertEquals(wrongValue, replacementChar, replacingReader.read());
}
 
Example 17
Source File: CharacterEncodingExamples.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Encodes {@code input} as UTF-8 and decodes the first line of the resulting
 * bytes with {@code charset}, applying {@code codingErrorAction} to malformed
 * input.
 *
 * <p>The bytes are produced with an explicit UTF-8 encoding rather than the
 * platform default, so the result does not vary between machines.
 *
 * @param input the text to round-trip
 * @param charset the charset used to decode the bytes
 * @param codingErrorAction what the decoder does on malformed input
 * @return the first decoded line, or {@code null} if there is none
 * @throws IOException if the decoder reports malformed input
 */
static String decodeText(String input, Charset charset, CodingErrorAction codingErrorAction) throws IOException {
    CharsetDecoder charsetDecoder = charset.newDecoder();
    charsetDecoder.onMalformedInput(codingErrorAction);
    byte[] encoded = input.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(new ByteArrayInputStream(encoded), charsetDecoder))) {
        return reader.readLine();
    }
}
 
Example 18
Source File: AbstractTextParser.java    From CloverETL-Engine with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Makes the given decoder lenient: both malformed input and unmappable
 * characters are replaced instead of being reported.
 *
 * @param decoder the decoder to reconfigure
 */
protected static void setLenientDecoder(CharsetDecoder decoder) {
	decoder
			.onMalformedInput(CodingErrorAction.REPLACE)
			.onUnmappableCharacter(CodingErrorAction.REPLACE);
}
 
Example 19
Source File: NonBlockingFetcher.java    From SEAL with Apache License 2.0 4 votes vote down vote up
/**
 * Decodes every fetched work item into a document string and stores it in
 * {@code documents} at the same index ({@code null} for missing or failed
 * items).
 *
 * Each buffer is first decoded with the default charset. If the text declares
 * a different encoding (matched by {@code CHARSET_PAT}), the buffer is decoded
 * a second time with that encoding. Malformed and unmappable input is ignored
 * in both passes.
 */
private static void processDocuments() {
    CharsetDecoder decoder = charset.newDecoder();
    decoder.onMalformedInput(CodingErrorAction.IGNORE);
    decoder.onUnmappableCharacter(CodingErrorAction.IGNORE);

    // perform 1 to 2-pass decoding on every document
    for (int i = 0; i < works.length; i++) {
      documents.add(null);
      if (works[i] == null)
        continue;

      URL url = works[i].url;
      log.debug("[" + (i + 1) + "/" + urls.length + "] Processing: " + url);
      String encoding = DEFAULT_ENCODING;
      String doc = null;
      ByteBuffer buffer = works[i].buffer;
      buffer.flip();

      try {
        // try to use default encoding to decode the document
        doc = decoder.decode(buffer).toString();

        // identify encoding by looking into the <meta> tag
        Matcher m = CHARSET_PAT.matcher(doc);
        if (m.find()) {
          // Charset names are ASCII; a locale-sensitive upper-casing would
          // corrupt them under some default locales (e.g. Turkish dotted I).
          encoding = m.group(1).toUpperCase(java.util.Locale.ROOT);
          log.debug("Encoding identified as: " + encoding);
        } else {
          log.debug("Encoding could not be identified! Using the default: " + DEFAULT_ENCODING);
        }

        // if the identified encoding is different from the default encoding
        if (!encoding.equals(DEFAULT_ENCODING)) {
          // decode again using the identified encoding
          CharsetDecoder d = Charset.forName(encoding).newDecoder();
          d.onUnmappableCharacter(CodingErrorAction.IGNORE);
          d.onMalformedInput(CodingErrorAction.IGNORE);
          // re-read the same bytes from the start; the limit is left untouched
          buffer.rewind();
          doc = d.decode(buffer).toString();
        }
      } catch (Exception e) {
        log.error("Character coding error: " + e);
        continue;
      }
      documents.set(i, removeHTTPHeader(doc));
//      doc = removeHTTPHeader(doc);
//      doc = CacheRecoverer.recover(url, doc);
//      documents.set(i, doc);
    }
}
 
Example 20
Source File: ConvertCharacterSet.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Re-encodes the content of the next FlowFile from the input character set
 * to the output character set.
 *
 * Both character sets are read from the processor properties, evaluated
 * against the FlowFile's attributes. Malformed or unmappable input is
 * replaced with "?" on both the decoding and the encoding side. On success the
 * FlowFile is routed to {@code REL_SUCCESS}; any failure is rethrown as a
 * {@code ProcessException}.
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();

    final Charset inputCharset = Charset.forName(context.getProperty(INPUT_CHARSET).evaluateAttributeExpressions(flowFile).getValue());
    final Charset outputCharset = Charset.forName(context.getProperty(OUTPUT_CHARSET).evaluateAttributeExpressions(flowFile).getValue());
    final CharBuffer charBuffer = CharBuffer.allocate(MAX_BUFFER_SIZE);

    final CharsetDecoder decoder = inputCharset.newDecoder();
    decoder.onMalformedInput(CodingErrorAction.REPLACE);
    decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    decoder.replaceWith("?");

    final CharsetEncoder encoder = outputCharset.newEncoder();
    encoder.onMalformedInput(CodingErrorAction.REPLACE);
    encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
    encoder.replaceWith("?".getBytes(outputCharset));

    try {
        final StopWatch stopWatch = new StopWatch(true);
        flowFile = session.write(flowFile, new StreamCallback() {
            @Override
            public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
                try (final BufferedReader reader = new BufferedReader(new InputStreamReader(rawIn, decoder), MAX_BUFFER_SIZE);
                        final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(rawOut, encoder), MAX_BUFFER_SIZE)) {
                    int charsRead;
                    while ((charsRead = reader.read(charBuffer)) != -1) {
                        charBuffer.flip();
                        writer.write(charBuffer.array(), 0, charsRead);
                        // Make the whole buffer writable again; without this the
                        // limit left by flip() caps every later read at the size
                        // of the previous one.
                        charBuffer.clear();
                    }

                    writer.flush();
                }
            }
        });

        session.getProvenanceReporter().modifyContent(flowFile, stopWatch.getElapsed(TimeUnit.MILLISECONDS));
        logger.info("successfully converted characters from {} to {} for {}",
                new Object[]{inputCharset, outputCharset, flowFile});
        session.transfer(flowFile, REL_SUCCESS);
    } catch (final Exception e) {
        throw new ProcessException(e);
    }
}