Java Code Examples for org.apache.commons.io.input.BOMInputStream

The following examples show how to use org.apache.commons.io.input.BOMInputStream. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may want to check out the right sidebar which shows the related API usage.
Example 1
Source Project: quaerite   Source File: ElevateQueryComparer.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Loads a query set from a CSV file that starts with a header row.
 *
 * <p>The file is read through a {@code BOMInputStream}, decoded as UTF-8 and parsed with
 * {@code CSVFormat.EXCEL}. Every record must provide a {@code query} column and an
 * integer {@code count} column. A query that is already present in the set is reported
 * with a warning; the record is still handed to {@code querySet.set}.
 *
 * @param file path of the CSV file to read
 * @return the populated query set
 * @throws Exception if the file cannot be read or a count value is not a valid integer
 */
private static QuerySet loadQueries(Path file) throws Exception {
    QuerySet querySet = new QuerySet();
    // One try-with-resources is enough: resources are closed in reverse order,
    // so the reader is closed before the underlying file stream.
    try (InputStream is = Files.newInputStream(file);
         Reader reader = new InputStreamReader(new BOMInputStream(is), "UTF-8")) {
        Iterable<CSVRecord> records = CSVFormat.EXCEL
                .withFirstRecordAsHeader().parse(reader);
        for (CSVRecord record : records) {
            String q = record.get("query");
            Integer c = Integer.valueOf(record.get("count"));
            if (querySet.queries.containsKey(q)) {
                LOG.warn("duplicate queries?! >" + q + "<");
            }

            querySet.set(q, c);
        }
    }
    LOG.info("loaded " + querySet.queries.size() + " queries");
    return querySet;
}
 
Example 2
Source Project: BUbiNG   Source File: URLRespectsRobots.java    License: Apache License 2.0 6 votes vote down vote up
/** Parses a <code>robots.txt</code> file contained in a {@link URIResponse} and
 * returns the corresponding filter as an array of sorted prefixes. HTTP statuses
 * different from 2xx are logged at info level. HTTP statuses of class 4xx and 5xx
 * generate an empty filter (everything is considered allowed). HTTP statuses 2xx/3xx
 * cause the tentative parsing of the response content. In the remaining cases we
 * return {@code null} (everything is considered forbidden).
 *
 * @param robotsResponse the response containing <code>robots.txt</code>.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order,
 * or {@code null}
 * @throws IOException if the response content cannot be read.
 */
public static char[][] parseRobotsResponse(final URIResponse robotsResponse, final String userAgent) throws IOException {
	final int status = robotsResponse.response().getStatusLine().getStatusCode();
	final int statusClass = status / 100;
	if (statusClass != 2) LOGGER.info("Got status " + status + " while fetching robots: URL was " + robotsResponse.uri());
	if (statusClass == 4 || statusClass == 5) return EMPTY_ROBOTS_FILTER; // For status 4xx and 5xx, we consider everything allowed.
	if (statusClass != 2 && statusClass != 3) return null; // For status 2xx and 3xx we parse the content. For the rest, we consider everything forbidden.
	// With include == false the stream detects a leading UTF-8 BOM and drops it by itself,
	// so there is no need to compute the BOM length and skip it by hand.
	final BOMInputStream bomInputStream = new BOMInputStream(robotsResponse.response().getEntity().getContent(), false);
	// Parse robots (BOM is ignored, robots are UTF-8, as suggested by https://developers.google.com/search/reference/robots_txt)
	char[][] result = parseRobotsReader(new InputStreamReader(bomInputStream, Charsets.UTF_8), userAgent);
	if (LOGGER.isDebugEnabled()) LOGGER.debug("Robots for {} successfully got with status {}: {}", robotsResponse.uri(), Integer.valueOf(status), toString(result));
	return result;
}
 
Example 3
Source Project: rdf4j   Source File: TurtleParser.java    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Implementation of the {@code parse(InputStream, String)} method defined in the RDFParser interface.
 * <p>
 * The stream is wrapped in a {@code BOMInputStream} (BOM excluded from the data), decoded as UTF-8 and handed
 * to {@code parse(Reader, String)}.
 *
 * @param in      The InputStream from which to read the data, must not be {@code null}. The InputStream is
 *                supposed to contain UTF-8 encoded Unicode characters, as per the Turtle specification.
 * @param baseURI The URI associated with the data in the InputStream, must not be {@code null}.
 * @throws IOException              If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException        If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException      If the configured statement handler encountered an unrecoverable error.
 * @throws IllegalArgumentException If the supplied input stream is {@code null}; the base URI is validated by
 *                                  {@code parse(Reader, String)}.
 */
@Override
public synchronized void parse(InputStream in, String baseURI)
		throws IOException, RDFParseException, RDFHandlerException {
	if (in == null) {
		throw new IllegalArgumentException("Input stream must not be 'null'");
	}
	// Note: baseURI will be checked in parse(Reader, String)

	// The Charset-based InputStreamReader constructor cannot fail with an
	// UnsupportedEncodingException, so no catch-and-wrap is needed here; any
	// IOException raised while parsing propagates unchanged to the caller.
	parse(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8), baseURI);
}
 
Example 4
Source Project: rdf4j   Source File: RDFXMLParser.java    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Reads RDF/XML data from a byte stream; relative URI references are resolved against {@code baseURI}.
 * <p>
 * The stream is wrapped in a {@code BOMInputStream} (BOM excluded from the data) and passed on as a SAX
 * {@code InputSource} whose system id is the base URI.
 *
 * @param in      The InputStream from which to read the data, must not be {@code null}.
 * @param baseURI The URI associated with the data in the InputStream, must not be {@code null}.
 * @throws IOException              If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException        If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException      If the configured statement handler encountered an unrecoverable error.
 * @throws IllegalArgumentException If the supplied input stream or base URI is {@code null}.
 */
@Override
public synchronized void parse(InputStream in, String baseURI)
		throws IOException, RDFParseException, RDFHandlerException {
	// Argument validation: the stream is checked before the base URI.
	if (in == null) {
		throw new IllegalArgumentException("Input stream cannot be 'null'");
	}
	if (baseURI == null) {
		throw new IllegalArgumentException("Base URI cannot be 'null'");
	}

	final BOMInputStream bomFilteredStream = new BOMInputStream(in, false);
	final InputSource xmlSource = new InputSource(bomFilteredStream);
	xmlSource.setSystemId(baseURI);

	parse(xmlSource);
}
 
Example 5
Source Project: rdf4j   Source File: NTriplesParser.java    License: BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Implementation of the {@code parse(InputStream, String)} method defined in the RDFParser interface.
 * <p>
 * The stream is wrapped in a {@code BOMInputStream} (BOM excluded from the data), decoded as UTF-8 and handed
 * to {@code parse(Reader, String)}.
 *
 * @param in      The InputStream from which to read the data, must not be {@code null}. The InputStream is
 *                supposed to contain 7-bit US-ASCII characters, as per the N-Triples specification.
 * @param baseURI The URI associated with the data in the InputStream, must not be {@code null}.
 * @throws IOException              If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException        If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException      If the configured statement handler encountered an unrecoverable error.
 * @throws IllegalArgumentException If the supplied input stream is {@code null}; the base URI is validated by
 *                                  {@code parse(Reader, String)}.
 */
@Override
public synchronized void parse(InputStream in, String baseURI)
		throws IOException, RDFParseException, RDFHandlerException {
	if (in == null) {
		throw new IllegalArgumentException("Input stream can not be 'null'");
	}
	// Note: baseURI will be checked in parse(Reader, String)

	// The Charset-based InputStreamReader constructor cannot fail with an
	// UnsupportedEncodingException, so no catch-and-wrap is needed here; any
	// IOException raised while parsing propagates unchanged to the caller.
	parse(new InputStreamReader(new BOMInputStream(in, false), StandardCharsets.UTF_8), baseURI);
}
 
Example 6
Source Project: mojito   Source File: CommandHelper.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads the whole content of a file as a string.
 *
 * <p>If the file starts with one of the byte order marks configured in {@code boms},
 * the content is decoded with the charset announced by that BOM (the BOM itself is
 * not part of the returned text); otherwise the content is decoded as UTF-8.
 *
 * @param path the file to read
 * @return the decoded file content
 * @throws UncheckedIOException if the file cannot be opened or read
 */
public String getFileContent(Path path) {
    File file = path.toFile();
    // try-with-resources guarantees the file handle is released on every path,
    // including when decoding fails.
    try (BOMInputStream inputStream = new BOMInputStream(FileUtils.openInputStream(file), false, boms)) {
        if (inputStream.hasBOM()) {
            return IOUtils.toString(inputStream, inputStream.getBOMCharsetName());
        }
        return IOUtils.toString(inputStream, StandardCharsets.UTF_8);
    } catch (IOException e) {
        throw new UncheckedIOException("Cannot get file content for path: " + path.toString(), e);
    }
}
 
Example 7
Source Project: zap-extensions   Source File: BinList.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the trie of BIN records from the bundled {@code BINLIST} CSV resource.
 *
 * <p>The resource is read as UTF-8 through a {@code BOMInputStream}; its first record is
 * used as the header. Each record is stored under the value of its {@code bin} column.
 * Loading is best effort: if the resource is missing or cannot be read, a warning is
 * logged and an empty trie is returned.
 *
 * @return the populated trie, or an empty trie if the resource could not be loaded
 */
private static Trie<String, BinRecord> createTrie() {
    Trie<String, BinRecord> trie = new PatriciaTrie<>();
    Iterable<CSVRecord> records;
    try (InputStream in = BinList.class.getResourceAsStream(BINLIST)) {
        // getResourceAsStream returns null for a missing resource; check it explicitly
        // instead of waiting for a NullPointerException on the first read.
        if (in == null) {
            LOGGER.warn("Resource not found: " + BINLIST);
            return trie;
        }
        try (BOMInputStream bomStream = new BOMInputStream(in);
                InputStreamReader inStream =
                        new InputStreamReader(bomStream, StandardCharsets.UTF_8)) {
            records = CSVFormat.DEFAULT.withFirstRecordAsHeader().parse(inStream).getRecords();
        }
    } catch (IOException e) {
        LOGGER.warn("Exception while loading: " + BINLIST, e);
        return trie;
    }

    for (CSVRecord record : records) {
        String bin = record.get("bin");
        trie.put(
                bin,
                new BinRecord(
                        bin,
                        record.get("brand"),
                        record.get("category"),
                        record.get("issuer")));
    }
    return trie;
}
 
Example 8
Source Project: webanno   Source File: ConstraintsServiceImpl.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads the rules file of a constraint set and returns its content as a string.
 *
 * <p>The file is located at
 * {@code <repository path>/<PROJECT_FOLDER>/<project id>/<CONSTRAINTS>/<set id>.txt},
 * read through a {@code BOMInputStream} and decoded as UTF-8. After reading, an info
 * message is logged with the project id placed in the logging MDC.
 *
 * @param aSet the constraint set whose rules file is read
 * @return the content of the rules file
 * @throws IOException if the file cannot be opened or read
 */
@Override
public String readConstrainSet(ConstraintSet aSet)
    throws IOException
{
    String rulesFolder = repositoryProperties.getPath().getAbsolutePath() + "/"
            + PROJECT_FOLDER + "/" + aSet.getProject().getId() + "/"
            + ConstraintsService.CONSTRAINTS + "/";
    File rulesFile = new File(rulesFolder, aSet.getId() + ".txt");

    String content;
    try (BOMInputStream rulesStream = new BOMInputStream(new FileInputStream(rulesFile))) {
        content = IOUtils.toString(rulesStream, "UTF-8");
    }

    // The MDC entry only needs to live for the duration of the log statement.
    try (MDC.MDCCloseable mdcEntry = MDC.putCloseable(Logging.KEY_PROJECT_ID,
            String.valueOf(aSet.getProject().getId()))) {
        log.info("Read constraints set [{}] in project [{}]({})",
                aSet.getName(), aSet.getProject().getName(), aSet.getProject().getId());
    }

    return content;
}
 
Example 9
Source Project: crawler-commons   Source File: SiteMapParser.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Gunzips the given content and processes the resulting XML sitemap.
 * <p>
 * The decompressed stream is passed through a {@code BOMInputStream} and a
 * {@code SkipLeadingWhiteSpaceInputStream} before parsing. The system id of the XML
 * source is the given URL with a trailing {@code .gz} removed.
 * 
 * @param url
 *            - URL of the gzipped content
 * @param response
 *            - Gzipped content
 * @return the site map
 * @throws UnknownFormatException
 *             if there is an error parsing the gzip
 * @throws IOException
 *             if there is an error reading in the gzip {@link java.net.URL}
 */
protected AbstractSiteMap processGzippedXML(URL url, byte[] response) throws IOException, UnknownFormatException {

    LOG.debug("Processing gzipped XML");

    final InputStream compressed = new ByteArrayInputStream(response);

    // Strip the ".gz" suffix so the system id names the XML document itself
    final String urlWithoutGz = url.toString().replaceFirst("\\.gz$", "");
    LOG.debug("XML url = {}", urlWithoutGz);

    final InputStream gunzipped = new GZIPInputStream(compressed);
    final InputStream cleaned = new SkipLeadingWhiteSpaceInputStream(new BOMInputStream(gunzipped));

    final InputSource xmlSource = new InputSource(cleaned);
    xmlSource.setSystemId(urlWithoutGz);
    return processXml(url, xmlSource);
}
 
Example 10
Source Project: nifi   Source File: CSVRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a record reader that parses CSV content from the given stream.
 *
 * <p>The stream is wrapped in a {@code BOMInputStream} and decoded with {@code encoding}.
 * The header configuration of the parser depends on the two header flags:
 * <ul>
 * <li>no header line: the column names are the schema's field names;</li>
 * <li>header line present but ignored: the header record is skipped and the schema's
 * field names are used;</li>
 * <li>header line present and used: the header record is skipped and the first record
 * supplies the column names.</li>
 * </ul>
 *
 * @param in the CSV content
 * @param logger forwarded to the superclass constructor
 * @param schema forwarded to the superclass constructor; also the source of column names
 *               when the header line is absent or ignored
 * @param csvFormat the base CSV format to which the header settings are applied
 * @param hasHeader whether the content starts with a header line
 * @param ignoreHeader whether a present header line is ignored in favour of the schema
 * @param dateFormat forwarded to the superclass constructor
 * @param timeFormat forwarded to the superclass constructor
 * @param timestampFormat forwarded to the superclass constructor
 * @param encoding name of the charset used to decode the stream
 * @throws IOException if the encoding is not supported or the parser cannot be created
 */
public CSVRecordReader(final InputStream in, final ComponentLog logger, final RecordSchema schema, final CSVFormat csvFormat, final boolean hasHeader, final boolean ignoreHeader,
                       final String dateFormat, final String timeFormat, final String timestampFormat, final String encoding) throws IOException {
    super(logger, schema, hasHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat);

    final Reader decodedInput = new InputStreamReader(new BOMInputStream(in), encoding);

    final CSVFormat formatWithHeader;
    if (!hasHeader) {
        formatWithHeader = csvFormat.withHeader(schema.getFieldNames().toArray(new String[0]));
    } else if (ignoreHeader) {
        formatWithHeader = csvFormat.withSkipHeaderRecord()
                .withHeader(schema.getFieldNames().toArray(new String[0]));
    } else {
        formatWithHeader = csvFormat.withSkipHeaderRecord().withFirstRecordAsHeader();
    }

    csvParser = new CSVParser(decodedInput, formatWithHeader);
}
 
Example 11
Source Project: nifi   Source File: CSVHeaderSchemaStrategy.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Derives a record schema from the header line of CSV content.
 *
 * <p>The CSV format is created from the configured context and the given variables, with
 * the first record used as header. Every header column becomes a nullable
 * {@code STRING} field of the returned schema.
 *
 * @param variables variables passed to {@code CSVUtils.createCSVFormat}
 * @param contentStream the CSV content; it is closed when this method returns because
 *                      the reader wrapping it is closed
 * @param readSchema not used by this implementation
 * @return a schema with one string field per header column
 * @throws SchemaNotFoundException if no context is available or the header cannot be read
 */
@Override
public RecordSchema getSchema(Map<String, String> variables, final InputStream contentStream, final RecordSchema readSchema) throws SchemaNotFoundException {
    if (this.context == null) {
        throw new SchemaNotFoundException("Schema Access Strategy intended only for validation purposes and cannot obtain schema");
    }

    try {
        final CSVFormat headerFormat = CSVUtils.createCSVFormat(context, variables).withFirstRecordAsHeader();
        // NOTE(review): no charset is passed to InputStreamReader, so the platform default
        // charset is used for decoding - confirm this is intended.
        try (final Reader headerReader = new InputStreamReader(new BOMInputStream(contentStream));
            final CSVParser headerParser = new CSVParser(headerReader, headerFormat)) {

            final List<RecordField> stringFields = new ArrayList<>();
            for (final String header : headerParser.getHeaderMap().keySet()) {
                final RecordField field = new RecordField(header, RecordFieldType.STRING.getDataType(), true);
                stringFields.add(field);
            }

            return new SimpleRecordSchema(stringFields);
        }
    } catch (final Exception e) {
        throw new SchemaNotFoundException("Failed to read Header line from CSV", e);
    }
}
 
Example 12
Source Project: quaerite   Source File: ElevateAnalysisEvaluator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads queries and their counts from a CSV file that starts with a header row.
 *
 * <p>The file is read through a {@code BOMInputStream}, decoded as UTF-8 and parsed with
 * {@code CSVFormat.EXCEL}. Queries are lower-cased with {@code Locale.US}. When the file
 * has a {@code count} column its integer value is used, otherwise each record counts as
 * 1. Counts of records that map to the same lower-cased query are summed.
 *
 * @param queryFile path of the CSV file to read
 * @return a map from lower-cased query to its total count
 * @throws IOException if the file cannot be read
 */
private Map<String, Integer> loadQueries(Path queryFile) throws IOException {
    Map<String, Integer> queries = new HashMap<>();
    // CSVFormat.parse returns a CSVParser, so it is declared as such (no downcast
    // needed) and closed together with the reader.
    try (Reader reader = new InputStreamReader(
            new BOMInputStream(Files.newInputStream(queryFile)), "UTF-8");
         CSVParser parser = CSVFormat.EXCEL
                 .withFirstRecordAsHeader().parse(reader)) {
        boolean hasCount = parser.getHeaderMap().containsKey("count");

        for (CSVRecord r : parser) {
            String query = r.get("query").toLowerCase(Locale.US);
            int cnt = hasCount ? Integer.parseInt(r.get("count")) : 1;

            // Accumulate onto a count already recorded for the same query.
            Integer existing = queries.get(query);
            if (existing != null) {
                cnt += existing;
            }
            queries.put(query, cnt);
        }
    }
    return queries;
}
 
Example 13
Source Project: quaerite   Source File: QueryLoader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Loads judgments from a CSV file and adds them to the experiment database.
 *
 * <p>The file is read through a {@code BOMInputStream}, decoded as UTF-8 and parsed with
 * {@code CSVFormat.EXCEL} using the first record as header. The presence of the
 * {@code DOCUMENT_ID}, {@code QUERY_SET}, {@code COUNT} and {@code QUERY_ID} columns is
 * detected from the header; records are loaded by id when a {@code QUERY_ID} column
 * exists and without id otherwise. The judgments are added to the database after the
 * file has been closed.
 *
 * @param experimentDB the database that receives the judgments
 * @param file path of the CSV file to read
 * @param freshStart if {@code true}, existing judgments are cleared before loading
 * @throws IOException if the file cannot be read
 * @throws SQLException if a database operation fails
 */
public static void loadJudgments(ExperimentDB experimentDB, Path file,
                                 boolean freshStart) throws IOException, SQLException {
    if (freshStart) {
        experimentDB.clearJudgments();
    }

    final Map<String, Judgments> judgmentsMap;
    // CSVFormat.parse returns a CSVParser, so it is declared as such (no downcasts
    // needed) and closed together with the reader and the file stream.
    try (InputStream is = Files.newInputStream(file);
         Reader reader = new InputStreamReader(new BOMInputStream(is), "UTF-8");
         CSVParser parser = CSVFormat.EXCEL
                 .withFirstRecordAsHeader().parse(reader)) {
        // Look the header map up once and reuse it for every column check.
        Map<String, Integer> headers = parser.getHeaderMap();
        boolean hasJudgments = headers.containsKey(DOCUMENT_ID);
        boolean hasQuerySet = headers.containsKey(QUERY_SET);
        boolean hasCount = headers.containsKey(COUNT);
        boolean hasQueryId = headers.containsKey(QUERY_ID);
        Set<String> queryStringNames = getQueryStringNames(headers.keySet());
        if (hasQueryId) {
            judgmentsMap = loadJudgmentsWithId(hasJudgments, hasQuerySet, hasCount,
                    queryStringNames, parser);
        } else {
            judgmentsMap = loadJudmentsWithoutId(hasJudgments, hasQuerySet,
                    hasCount, queryStringNames, parser);
        }
    }
    for (Judgments judgments : judgmentsMap.values()) {
        experimentDB.addJudgment(judgments);
    }
}
 
Example 14
Source Project: hop   Source File: CsvInput.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the first line of a CSV file and splits it into field names.
 *
 * <p>Delimiter, enclosure and encoding are taken from the metadata after variable
 * substitution. The file is opened through {@code HopVfs} and wrapped in a
 * {@code BOMInputStream} configured for UTF-8, UTF-16LE and UTF-16BE byte order marks.
 * When no encoding is configured the reader uses the platform default charset. The
 * resulting names are passed to {@code removeEnclosure} (only when an enclosure is
 * configured) and to {@code trimFieldNames} before being returned.
 *
 * @param fileName name of the file to read
 * @param csvInputMeta metadata providing delimiter, enclosure, escape character and encoding
 * @return the field names found on the first line
 * @throws HopException if the file cannot be opened or read
 */
String[] readFieldNamesFromFile( String fileName, CsvInputMeta csvInputMeta ) throws HopException {
  String resolvedDelimiter = environmentSubstitute( csvInputMeta.getDelimiter() );
  String resolvedEnclosure = environmentSubstitute( csvInputMeta.getEnclosure() );
  String resolvedEncoding = environmentSubstitute( csvInputMeta.getEncoding() );

  try ( FileObject csvFile = HopVfs.getFileObject( fileName );
        BOMInputStream bomStream =
          new BOMInputStream( HopVfs.getInputStream( csvFile ), ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
            ByteOrderMark.UTF_16BE ) ) {
    // No configured encoding: fall back to the platform default charset.
    InputStreamReader headerReader = Utils.isEmpty( resolvedEncoding )
      ? new InputStreamReader( bomStream )
      : new InputStreamReader( bomStream, resolvedEncoding );

    EncodingType encodingType = EncodingType.guessEncodingType( headerReader.getEncoding() );
    StringBuilder lineBuffer = new StringBuilder( 1000 );
    String headerLine =
      TextFileInput.getLine( log, headerReader, encodingType, TextFileInputMeta.FILE_FORMAT_UNIX, lineBuffer );

    String[] names = TextFileLineUtil.guessStringsFromLine( log, headerLine, resolvedDelimiter, resolvedEnclosure,
      csvInputMeta.getEscapeCharacter() );
    if ( !Utils.isEmpty( csvInputMeta.getEnclosure() ) ) {
      removeEnclosure( names, csvInputMeta.getEnclosure() );
    }
    trimFieldNames( names );
    return names;
  } catch ( IOException e ) {
    throw new HopFileException( BaseMessages.getString( PKG, "CsvInput.Exception.CreateFieldMappingError" ), e );
  }
}
 
Example 15
Source Project: htmlunit   Source File: WebResponse.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Converts the response content into a string with the given charset instead of the
 * charset/encoding announced by the server. When the content starts with a UTF-16BE or
 * UTF-16LE BOM, or with a UTF-8 BOM that is not ignored, the charset of that BOM is used
 * instead of the given one.
 * @param encoding the charset/encoding to use to convert the response content into a string
 * @param ignoreUtf8Bom if true a UTF-8 BOM does not override the given charset
 * @return the response content as a string, or null if there is no response data or
 *         reading the content failed
 */
public String getContentAsString(final Charset encoding, final boolean ignoreUtf8Bom) {
    if (responseData_ == null) {
        return null;
    }

    try (InputStream in = responseData_.getInputStreamWithBomIfApplicable(BOM_HEADERS)) {
        // plain stream: nothing to detect, decode with the requested charset
        if (!(in instanceof BOMInputStream)) {
            return IOUtils.toString(in, encoding);
        }

        try (BOMInputStream bomStream = (BOMInputStream) in) {
            // there seems to be a bug in BOMInputStream
            // we have to call this before hasBOM(ByteOrderMark)
            final boolean bomFound = bomStream.hasBOM();
            if (bomFound) {
                if (!ignoreUtf8Bom && bomStream.hasBOM(ByteOrderMark.UTF_8)) {
                    return IOUtils.toString(bomStream, UTF_8);
                }
                if (bomStream.hasBOM(ByteOrderMark.UTF_16BE)) {
                    return IOUtils.toString(bomStream, UTF_16BE);
                }
                if (bomStream.hasBOM(ByteOrderMark.UTF_16LE)) {
                    return IOUtils.toString(bomStream, UTF_16LE);
                }
            }
            return IOUtils.toString(bomStream, encoding);
        }
    }
    catch (final IOException e) {
        LOG.warn(e.getMessage(), e);
        return null;
    }
}
 
Example 16
Source Project: htmlunit   Source File: XmlUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Parses the content of a web response into a DOM document.
 * A {@code null} response yields a new empty document. Parser messages are routed to
 * {@code DISCARD_MESSAGES_HANDLER} and every external entity is resolved to empty
 * content. If parsing fails and the content turned out to be blank, a new empty
 * document is returned instead of the failure.
 *
 * @param webResponse the response from the server
 * @throws IOException if the page could not be created
 * @return the parse result
 * @throws SAXException if the parsing fails and the content was not blank
 * @throws ParserConfigurationException if a DocumentBuilder cannot be created
 */
public static Document buildDocument(final WebResponse webResponse)
    throws IOException, SAXException, ParserConfigurationException {

    final DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
    if (webResponse == null) {
        return builderFactory.newDocumentBuilder().newDocument();
    }
    builderFactory.setNamespaceAware(true);

    // the blank input check and the parsing have to happen in one pass,
    // therefore the reader is wrapped by a tracker that records what was read
    final TrackBlankContentReader blankTracker = new TrackBlankContentReader(
            new InputStreamReader(
                    new BOMInputStream(webResponse.getContentAsStream()),
                    webResponse.getContentCharset()));

    final DocumentBuilder documentBuilder = builderFactory.newDocumentBuilder();
    documentBuilder.setErrorHandler(DISCARD_MESSAGES_HANDLER);

    // external entities are never fetched; they resolve to empty content
    final EntityResolver emptyContentResolver = new EntityResolver() {
        @Override
        public InputSource resolveEntity(final String publicId, final String systemId)
            throws SAXException, IOException {
            return new InputSource(new StringReader(""));
        }
    };
    documentBuilder.setEntityResolver(emptyContentResolver);

    try {
        // this closes the input source/stream
        return documentBuilder.parse(new InputSource(blankTracker));
    }
    catch (final SAXException e) {
        if (!blankTracker.wasBlank()) {
            throw e;
        }
        return builderFactory.newDocumentBuilder().newDocument();
    }
}
 
Example 17
Source Project: HtmlUnit-Android   Source File: WebResponse.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
 *
 * Converts the response content into a string with the given charset instead of the
 * charset/encoding announced by the server. When the content starts with a UTF-16BE or
 * UTF-16LE BOM, or with a UTF-8 BOM that is not ignored, the charset of that BOM is used
 * instead of the given one.
 * @param encoding the charset/encoding to use to convert the response content into a string
 * @param ignoreUtf8Bom if true a UTF-8 BOM does not override the given charset
 * @return the response content as a string, or null if there is no response data, no
 *         input stream, or reading the content failed
 */
public String getContentAsString(final Charset encoding, final boolean ignoreUtf8Bom) {
    if (responseData_ == null) {
        return null;
    }

    try (InputStream in = responseData_.getInputStream()) {
        if (in == null) {
            return null;
        }

        try (BOMInputStream bomStream = new BOMInputStream(in, BOM_HEADERS)) {
            // there seems to be a bug in BOMInputStream
            // we have to call this before hasBOM(ByteOrderMark)
            final boolean bomFound = bomStream.hasBOM();
            if (bomFound) {
                if (!ignoreUtf8Bom && bomStream.hasBOM(ByteOrderMark.UTF_8)) {
                    return IOUtils.toString(bomStream, UTF_8);
                }
                if (bomStream.hasBOM(ByteOrderMark.UTF_16BE)) {
                    return IOUtils.toString(bomStream, UTF_16BE);
                }
                if (bomStream.hasBOM(ByteOrderMark.UTF_16LE)) {
                    return IOUtils.toString(bomStream, UTF_16LE);
                }
            }
            return IOUtils.toString(bomStream, encoding);
        }
    }
    catch (final IOException e) {
        LOG.warn(e);
        return null;
    }
}
 
Example 18
Source Project: HtmlUnit-Android   Source File: XmlUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Parses the content of a web response into a DOM document.
 * A {@code null} response yields a new empty document. Parser messages are routed to
 * {@code DISCARD_MESSAGES_HANDLER} and every external entity is resolved to empty
 * content. If parsing fails and the content turned out to be blank, a new empty
 * document is returned instead of the failure.
 *
 * @param webResponse the response from the server
 * @throws IOException if the page could not be created
 * @return the parse result
 * @throws SAXException if the parsing fails and the content was not blank
 * @throws ParserConfigurationException if a DocumentBuilder cannot be created
 */
public static Document buildDocument(final WebResponse webResponse)
    throws IOException, SAXException, ParserConfigurationException {

    final DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
    if (webResponse == null) {
        return domFactory.newDocumentBuilder().newDocument();
    }
    domFactory.setNamespaceAware(true);

    // blank detection and parsing share a single pass over the content,
    // so the decoding reader is wrapped by a tracker
    final InputStreamReader contentReader = new InputStreamReader(
            new BOMInputStream(webResponse.getContentAsStream()),
            webResponse.getContentCharset());
    final TrackBlankContentReader contentTracker = new TrackBlankContentReader(contentReader);
    final InputSource xmlSource = new InputSource(contentTracker);

    final DocumentBuilder domBuilder = domFactory.newDocumentBuilder();
    domBuilder.setErrorHandler(DISCARD_MESSAGES_HANDLER);

    // external entities are never fetched; they resolve to empty content
    final EntityResolver noExternalEntities = new EntityResolver() {
        @Override
        public InputSource resolveEntity(final String publicId, final String systemId)
            throws SAXException, IOException {
            return new InputSource(new StringReader(""));
        }
    };
    domBuilder.setEntityResolver(noExternalEntities);

    try {
        // this closes the input source/stream
        return domBuilder.parse(xmlSource);
    }
    catch (final SAXException e) {
        if (!contentTracker.wasBlank()) {
            throw e;
        }
        return domFactory.newDocumentBuilder().newDocument();
    }
}
 
Example 19
Source Project: rdf4j   Source File: TriXParser.java    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Reads TriX data from a byte stream; relative URI references are resolved against {@code baseURI}.
 * <p>
 * The stream is wrapped in a {@code BOMInputStream} (BOM excluded from the data) and passed on as a SAX
 * {@code InputSource} whose system id is the base URI.
 *
 * @param in      The InputStream from which to read the data, must not be {@code null}.
 * @param baseURI The URI associated with the data in the InputStream, must not be {@code null}.
 * @throws IOException              If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException        If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException      If the configured statement handler encountered an unrecoverable error.
 * @throws IllegalArgumentException If the supplied input stream or base URI is {@code null}.
 */
@Override
public void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
	// Argument validation: the stream is checked before the base URI.
	if (in == null) {
		throw new IllegalArgumentException("Input stream cannot be 'null'");
	}
	if (baseURI == null) {
		throw new IllegalArgumentException("Base URI cannot be 'null'");
	}

	final BOMInputStream bomFilteredStream = new BOMInputStream(in, false);
	final InputSource xmlSource = new InputSource(bomFilteredStream);
	xmlSource.setSystemId(baseURI);

	parse(xmlSource);
}
 
Example 20
Source Project: rdf4j   Source File: NQuadsParser.java    License: BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Parses data from a byte stream by decoding it as UTF-8 and delegating to
 * {@code parse(Reader, String)}. The stream is wrapped in a {@code BOMInputStream}
 * (BOM excluded from the data) before decoding.
 *
 * @param inputStream the stream from which to read the data, must not be {@code null}
 * @param baseURI     the URI associated with the data; validated by {@code parse(Reader, String)}
 * @throws IOException              if an I/O error occurred while data was read from the stream
 * @throws RDFParseException        if the parser has found an unrecoverable parse error
 * @throws RDFHandlerException      if the configured statement handler encountered an unrecoverable error
 * @throws IllegalArgumentException if the supplied input stream is {@code null}
 */
@Override
public synchronized void parse(final InputStream inputStream, final String baseURI)
		throws IOException, RDFParseException, RDFHandlerException {
	if (inputStream == null) {
		throw new IllegalArgumentException("Input stream can not be 'null'");
	}
	// Note: baseURI will be checked in parse(Reader, String)

	// The Charset-based InputStreamReader constructor cannot fail with an
	// UnsupportedEncodingException, so no catch-and-wrap is needed here; any
	// IOException raised while parsing propagates unchanged to the caller.
	parse(new InputStreamReader(new BOMInputStream(inputStream, false), StandardCharsets.UTF_8), baseURI);
}
 
Example 21
Source Project: saml-client   Source File: SamlClient.java    License: MIT License 5 votes vote down vote up
/**
 * Converts metadata supplied as a {@link java.io.Reader Reader} into a UTF-8 byte stream
 * that skips a BOM if it is present.
 * OpenSaml won't accept a metadata file if it starts with a BOM.
 * The reader is consumed completely and its text is re-encoded as UTF-8.
 * @param metadata The metadata with optional BOM
 * @return An {@link InputStream} over the UTF-8 encoded metadata, wrapped so that the BOM is excluded
 * @throws SamlException if the metadata cannot be read
 */
private static InputStream skipBom(Reader metadata) throws SamlException {
  try {
    final String metadataText = IOUtils.toString(metadata);
    final InputStream utf8Bytes = IOUtils.toInputStream(metadataText, StandardCharsets.UTF_8);
    return new BOMInputStream(utf8Bytes, false);
  } catch (IOException e) {
    throw new SamlException("Couldn't read metadata", e);
  }
}
 
Example 22
Source Project: mojito   Source File: CommandHelper.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the content into a file using the same format as the source file.
 *
 * <p>If the source file starts with one of the byte order marks configured in
 * {@code boms}, the output file receives the same BOM followed by the content encoded
 * with the charset announced by that BOM. Otherwise the content is written as UTF-8
 * without a BOM.
 *
 * @param content         content to be written
 * @param path            path to the file
 * @param sourceFileMatch the source file whose BOM/charset is reproduced
 * @throws CommandException if the source file cannot be read or the output file cannot be written
 */
public void writeFileContent(String content, Path path, FileMatch sourceFileMatch) throws CommandException {
    try {
        File outputFile = path.toFile();

        // Inspect the source file first and close it before writing, so that no file
        // handle is leaked and the source is not held open during the write.
        byte[] bomBytes = null;
        String bomCharsetName = null;
        try (BOMInputStream inputStream = new BOMInputStream(FileUtils.openInputStream(sourceFileMatch.getPath().toFile()), false, boms)) {
            if (inputStream.hasBOM()) {
                bomBytes = inputStream.getBOM().getBytes();
                bomCharsetName = inputStream.getBOMCharsetName();
            }
        }

        if (bomBytes != null) {
            // BOM first, then the content appended in the matching charset.
            FileUtils.writeByteArrayToFile(outputFile, bomBytes);
            FileUtils.writeByteArrayToFile(outputFile, content.getBytes(bomCharsetName), true);
        } else {
            FileUtils.writeStringToFile(outputFile, content, StandardCharsets.UTF_8);
        }
    } catch (IOException e) {
        throw new CommandException("Cannot write file content in path: " + path.toString(), e);
    }
}
 
Example 23
/**
 * Scans a character stream and records one {@code SourceLine} per line.
 *
 * <p>The stream is wrapped in a {@code BOMInputStream} (BOM excluded) and decoded with
 * the given charset. Lines are delimited by the line feed character. For every line a
 * {@code SourceLine} is created from the 1-based line number, the number of characters
 * read for the line (line feed included), and the running character total before and
 * after the line. An entry is always added for the text after the last line feed, even
 * when it is empty. Reading is best effort: any failure is logged and the lines
 * collected so far are returned.
 *
 * @param inputStream the content to scan; a {@code null} stream yields an empty array
 * @param charset the charset used to decode the stream
 * @return the collected lines, never {@code null}
 */
public SourceLine[] getLines(final InputStream inputStream, final Charset charset) {
	if (inputStream == null) {
		return new SourceLine[0];
	}
	final List<SourceLine> collected = new ArrayList<>();

	try (final BufferedReader reader = new BufferedReader(
			new InputStreamReader(new BOMInputStream(inputStream, false), charset))) {
		int lineNumber = 1;
		int charsSoFar = 0;
		int charsInLine = 0;

		for (int ch = reader.read(); ch != -1; ch = reader.read()) {
			charsSoFar++;
			charsInLine++;
			if (ch == '\n') {
				collected.add(new SourceLine(lineNumber, charsInLine, charsSoFar - charsInLine, charsSoFar));
				lineNumber++;
				charsInLine = 0;
			}
		}
		// trailing text after the last line feed (possibly empty)
		collected.add(new SourceLine(lineNumber, charsInLine, charsSoFar - charsInLine, charsSoFar));
	} catch (final Throwable e) {
		LOGGER.warn("Error occured reading file", e);
	}

	return collected.toArray(new SourceLine[0]);
}
 
Example 24
Source Project: modernmt   Source File: XMLUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@link XMLEventReader} over the given stream, picking the decoding charset
 * from a leading UTF-8 / UTF-16BE / UTF-16LE byte order mark when one is present and
 * falling back to {@code UTF8Charset.get()} otherwise. The BOM itself is not passed on
 * to the reader.
 *
 * @param stream the raw XML input
 * @return an event reader over the decoded stream
 * @throws XMLStreamException if BOM detection fails with an I/O error or the reader
 *                            cannot be created
 */
public static XMLEventReader createEventReader(InputStream stream) throws XMLStreamException {
    Charset effectiveCharset = UTF8Charset.get();

    BOMInputStream bomAware = new BOMInputStream(stream, false,
            ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE);

    boolean bomPresent;
    try {
        bomPresent = bomAware.hasBOM();
        if (bomPresent) {
            // A BOM overrides the default charset.
            effectiveCharset = Charset.forName(bomAware.getBOMCharsetName());
        }
    } catch (IOException e) {
        throw new XMLStreamException(e);
    }

    return XMLInputFactory.newInstance()
            .createXMLEventReader(new XMLFixInputStreamReader(bomAware, effectiveCharset));
}
 
Example 25
Source Project: p4ic4idea   Source File: MD5Digester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Feeds the digest with the UTF-8 re-encoding of a stream given in another charset.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is dropped, the remaining bytes are decoded
 * with {@code charset} in chunks of {@code bufferSize} characters, each chunk is re-encoded
 * as UTF-8 (malformed or unmappable input is reported as an error, not replaced) and,
 * when requested, passed through the client line-ending replacement before being digested.
 *
 * @param inStream                   stream to digest; closed when this method returns
 * @param charset                    charset the stream content is encoded in
 * @param isRequireLineEndingConvert whether each chunk goes through line-ending replacement
 * @param clientLineEnding           line ending handed to the replacement helper
 * @throws IOException if reading, decoding or re-encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStripped = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader decoder = new InputStreamReader(bomStripped, charset)) {

		CharsetEncoder strictUtf8 = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
		char[] chunk = new char[bufferSize];

		// NOTE(review): each chunk is encoded as a complete operation; a surrogate pair split
		// across two chunks would presumably be reported as malformed - confirm.
		for (int len = decoder.read(chunk); len > 0; len = decoder.read(chunk)) {
			// Server digest is computed over UTF-8, so re-encode before hashing.
			ByteBuffer encoded = strictUtf8.encode(CharBuffer.wrap(chunk, 0, len));

			ByteBuffer toDigest = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decoder, strictUtf8, encoded, clientLineEnding)
					: encoded;

			update(toDigest.array(), toDigest.arrayOffset(), toDigest.limit());
		}
	}
}
 
Example 26
Source Project: p4ic4idea   Source File: SubmitAndSyncUtf16FileTypeTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the size the given UTF-16 file has once its byte order mark is removed and the
 * remaining bytes are converted from {@code utf16} to UTF-8.
 *
 * @param testResourceFile file to measure
 * @param utf16            charset the file content is encoded in
 * @return the limit of the converted buffer
 * @throws Exception if the file cannot be read or converted
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream withoutBom = new BOMInputStream(
            new FileInputStream(testResourceFile), false,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE)) {
        // Everything after the (excluded) UTF-16 BOM.
        ByteBuffer payload = ByteBuffer.wrap(IOUtils.toByteArray(withoutBom));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        ByteBuffer converted = toUtf8.convert(payload);
        return converted.limit();
    }
}
 
Example 27
Source Project: p4ic4idea   Source File: MD5Digester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Feeds the digest with the UTF-8 re-encoding of a stream given in another charset.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is dropped, the remaining bytes are decoded
 * with {@code charset} in chunks of {@code bufferSize} characters, each chunk is re-encoded
 * as UTF-8 (malformed or unmappable input is reported as an error, not replaced) and,
 * when requested, passed through the client line-ending replacement before being digested.
 *
 * @param inStream                   stream to digest; closed when this method returns
 * @param charset                    charset the stream content is encoded in
 * @param isRequireLineEndingConvert whether each chunk goes through line-ending replacement
 * @param clientLineEnding           line ending handed to the replacement helper
 * @throws IOException if reading, decoding or re-encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStripped = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader decoder = new InputStreamReader(bomStripped, charset)) {

		CharsetEncoder strictUtf8 = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
		char[] chunk = new char[bufferSize];

		// NOTE(review): each chunk is encoded as a complete operation; a surrogate pair split
		// across two chunks would presumably be reported as malformed - confirm.
		for (int len = decoder.read(chunk); len > 0; len = decoder.read(chunk)) {
			// Server digest is computed over UTF-8, so re-encode before hashing.
			ByteBuffer encoded = strictUtf8.encode(CharBuffer.wrap(chunk, 0, len));

			ByteBuffer toDigest = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decoder, strictUtf8, encoded, clientLineEnding)
					: encoded;

			update(toDigest.array(), toDigest.arrayOffset(), toDigest.limit());
		}
	}
}
 
Example 28
Source Project: p4ic4idea   Source File: SubmitAndSyncUtf16FileTypeTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the size the given UTF-16 file has once its byte order mark is removed and the
 * remaining bytes are converted from {@code utf16} to UTF-8.
 *
 * @param testResourceFile file to measure
 * @param utf16            charset the file content is encoded in
 * @return the limit of the converted buffer
 * @throws Exception if the file cannot be read or converted
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream withoutBom = new BOMInputStream(
            new FileInputStream(testResourceFile), false,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE)) {
        // Everything after the (excluded) UTF-16 BOM.
        ByteBuffer payload = ByteBuffer.wrap(IOUtils.toByteArray(withoutBom));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        ByteBuffer converted = toUtf8.convert(payload);
        return converted.limit();
    }
}
 
Example 29
Source Project: p4ic4idea   Source File: MD5Digester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Feeds the digest with the UTF-8 re-encoding of a stream given in another charset.
 * A leading UTF-8/UTF-16/UTF-32 byte order mark is dropped, the remaining bytes are decoded
 * with {@code charset} in chunks of {@code bufferSize} characters, each chunk is re-encoded
 * as UTF-8 (malformed or unmappable input is reported as an error, not replaced) and,
 * when requested, passed through the client line-ending replacement before being digested.
 *
 * @param inStream                   stream to digest; closed when this method returns
 * @param charset                    charset the stream content is encoded in
 * @param isRequireLineEndingConvert whether each chunk goes through line-ending replacement
 * @param clientLineEnding           line ending handed to the replacement helper
 * @throws IOException if reading, decoding or re-encoding fails
 */
private void digestEncodedStreamToUtf8(@Nonnull InputStream inStream, @Nonnull Charset charset,
                                       boolean isRequireLineEndingConvert, @Nullable ClientLineEnding clientLineEnding)
		throws IOException {

	try (BOMInputStream bomStripped = new BOMInputStream(inStream, false,
			ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
			ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
	     InputStreamReader decoder = new InputStreamReader(bomStripped, charset)) {

		CharsetEncoder strictUtf8 = CharsetDefs.UTF8.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
		char[] chunk = new char[bufferSize];

		// NOTE(review): each chunk is encoded as a complete operation; a surrogate pair split
		// across two chunks would presumably be reported as malformed - confirm.
		for (int len = decoder.read(chunk); len > 0; len = decoder.read(chunk)) {
			// Server digest is computed over UTF-8, so re-encode before hashing.
			ByteBuffer encoded = strictUtf8.encode(CharBuffer.wrap(chunk, 0, len));

			ByteBuffer toDigest = isRequireLineEndingConvert
					? findAndReplaceEncodedClientLineEndingIfRequireLineEndingCovert(
							decoder, strictUtf8, encoded, clientLineEnding)
					: encoded;

			update(toDigest.array(), toDigest.arrayOffset(), toDigest.limit());
		}
	}
}
 
Example 30
Source Project: p4ic4idea   Source File: SubmitAndSyncUtf16FileTypeTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the size the given UTF-16 file has once its byte order mark is removed and the
 * remaining bytes are converted from {@code utf16} to UTF-8.
 *
 * @param testResourceFile file to measure
 * @param utf16            charset the file content is encoded in
 * @return the limit of the converted buffer
 * @throws Exception if the file cannot be read or converted
 */
private long getUtf16FileSizeAfterRemoveBomAndEncodedByUtf8(File testResourceFile, Charset utf16) throws Exception {
    try (BOMInputStream withoutBom = new BOMInputStream(
            new FileInputStream(testResourceFile), false,
            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE)) {
        // Everything after the (excluded) UTF-16 BOM.
        ByteBuffer payload = ByteBuffer.wrap(IOUtils.toByteArray(withoutBom));
        CharsetConverter toUtf8 = new CharsetConverter(utf16, CharsetDefs.UTF8);
        ByteBuffer converted = toUtf8.convert(payload);
        return converted.limit();
    }
}