org.mozilla.universalchardet.UniversalDetector Java Examples
The following examples show how to use
org.mozilla.universalchardet.UniversalDetector.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CrawlUtils.java From Asqatasun with GNU Affero General Public License v3.0 | 7 votes |
/**
 * Sniffs the character encoding of an HTML byte stream.
 *
 * <p>The stream is consumed in 4096-byte chunks that are handed to a
 * {@link UniversalDetector} until either a read yields no bytes or the
 * detector reports that it is done. The stream is not closed by this method.
 *
 * @param is stream holding the raw HTML bytes
 * @return the detected charset name when one was found and
 *         {@code CrawlUtils.isValidCharset} accepts it; {@code DEFAULT_CHARSET} otherwise
 * @throws java.io.IOException if reading from {@code is} fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    final UniversalDetector sniffer = new UniversalDetector(null);
    final byte[] chunk = new byte[4096];

    for (int count = is.read(chunk); count > 0 && !sniffer.isDone(); count = is.read(chunk)) {
        sniffer.handleData(chunk, 0, count);
    }
    sniffer.dataEnd();

    final String detected = sniffer.getDetectedCharset();
    if (detected == null) {
        LOGGER.debug("No encoding detected.");
    } else {
        LOGGER.debug("Detected encoding = " + detected);
    }
    sniffer.reset();

    final boolean usable = detected != null && CrawlUtils.isValidCharset(detected);
    return usable ? detected : DEFAULT_CHARSET;
}
Example #2
Source File: TaskIo.java From jdotxt with GNU General Public License v3.0 | 6 votes |
/**
 * Guesses the character encoding of a file by feeding its bytes to a
 * {@link UniversalDetector}.
 *
 * @param file the file to inspect
 * @return the charset name reported by the detector, or {@code DEFAULT_ENCODING}
 *         when the detector reports none
 * @throws IOException if the file cannot be opened or read
 */
private static String detectEncoding(File file) throws IOException {
    byte[] buf = new byte[4096];
    UniversalDetector detector = new UniversalDetector(null);
    FileInputStream fis = new FileInputStream(file);
    try {
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
    } finally {
        // Release the file handle even when a read fails part-way through.
        Util.closeStream(fis);
    }
    detector.dataEnd();
    String encoding = detector.getDetectedCharset();
    return encoding != null ? encoding : DEFAULT_ENCODING;
}
Example #3
Source File: CsvImporter.java From fingen with Apache License 2.0 | 6 votes |
/**
 * Guesses the character encoding of the file named by {@code mFileName}.
 *
 * <p>Detection is best-effort: any exception is printed and the fallback
 * {@code "UTF-8"} is returned instead.
 *
 * @return the charset name reported by the detector, or {@code "UTF-8"} when
 *         nothing was detected or an error occurred
 * @throws IOException declared for callers; failures inside the method are caught
 */
private String detectCharset() throws IOException {
    String result = "UTF-8";
    UniversalDetector detector = new UniversalDetector(null);
    byte[] buf = new byte[4096];
    try {
        FileInputStream fis = new FileInputStream(mFileName);
        try {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        } finally {
            // Close the file even if reading throws, so the handle is not leaked.
            fis.close();
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            result = encoding;
        }
        detector.reset();
    } catch (Exception e) {
        // Deliberate best-effort: report the problem and fall back to UTF-8.
        e.printStackTrace();
    }
    return result;
}
Example #4
Source File: UniversalEncodingDetector.java From onedev with MIT License | 6 votes |
public static boolean isBinary(InputStream in) throws IOException { byte[] buf = new byte[4]; in.mark(5); int len = in.read(buf); in.reset(); UniversalDetector detector = new UniversalDetector(null); detector.handleData(buf, 0, len); if (detector.isDone()) { return false; } //Not UTF check ASCII text in.mark(LOOKAHEAD); len = 0; int b; while ((b = in.read()) != -1 && len < (LOOKAHEAD - 192)) { len++; if (b == 0) { in.reset(); return true; } } in.reset(); return false; }
Example #5
Source File: BaseParser.java From substitution-schedule-parser with Mozilla Public License 2.0 | 6 votes |
BaseParser(SubstitutionScheduleData scheduleData, CookieProvider cookieProvider) { this.scheduleData = scheduleData; this.cookieProvider = cookieProvider; this.cookieStore = new BasicCookieStore(); this.colorProvider = new ColorProvider(scheduleData); this.encodingDetector = new UniversalDetector(null); this.debuggingDataHandler = new NoOpDebuggingDataHandler(); this.sardine = null; try { SSLConnectionSocketFactory sslsf = getSslConnectionSocketFactory(scheduleData); CloseableHttpClient httpclient = HttpClients.custom() .setSSLSocketFactory(sslsf) .setRedirectStrategy(new LaxRedirectStrategy()) .setDefaultRequestConfig(RequestConfig.custom() .setCookieSpec(CookieSpecs.STANDARD).build()) .setUserAgent( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36") .build(); this.executor = Executor.newInstance(httpclient).use(cookieStore); } catch (GeneralSecurityException | JSONException | IOException e) { throw new RuntimeException(e); } }
Example #6
Source File: Charset.java From dualsub with GNU General Public License v3.0 | 6 votes |
public static String detect(InputStream inputStream) throws IOException { UniversalDetector detector = Charset.getSingleton() .getCharsetDetector(); byte[] buf = new byte[4096]; int nread; while ((nread = inputStream.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } detector.dataEnd(); String encoding = detector.getDetectedCharset(); detector.reset(); inputStream.close(); if (encoding == null) { // If none encoding is detected, we assume UTF-8 encoding = UTF8; } return encoding; }
Example #7
Source File: CrawlerPack.java From CrawlerPack with Apache License 2.0 | 5 votes |
/** * Detecting real content encoding * @param content * @param offset * @return real charset encoding */ private String detectCharset(byte[] content, Integer offset){ log.debug("detectCharset: offset=" + offset); // detect failed if( offset > content.length ) return null; UniversalDetector detector = new UniversalDetector(null); detector.handleData(content, offset, content.length - offset > detectBuffer ? detectBuffer : content.length - offset); detector.dataEnd(); String detectEncoding = detector.getDetectedCharset(); return null==detectEncoding?detectCharset(content,offset+detectBuffer):detectEncoding; }
Example #8
Source File: CharsetDetector.java From webarchive-commons with Apache License 2.0 | 5 votes |
/**
 * Tries to determine a document's character set from its leading bytes using
 * the juniversalchardet library.
 *
 * @param buffer bytes of the document
 * @param len    number of bytes of {@code buffer}, counted from index 0, to examine
 * @return the result of {@code mapCharset} for the detected name when
 *         {@code isCharsetSupported} accepts it; {@code null} otherwise
 * @throws IOException declared for callers and overriding implementations
 */
protected String getCharsetFromBytes(byte buffer[], int len) throws IOException {
    final UniversalDetector sniffer = new UniversalDetector(null);
    sniffer.handleData(buffer, 0, len);
    sniffer.dataEnd();

    final String detected = sniffer.getDetectedCharset();
    sniffer.reset();

    if (!isCharsetSupported(detected)) {
        return null;
    }
    return mapCharset(detected);
}
Example #9
Source File: FileStorable.java From Readily with MIT License | 5 votes |
/**
 * Guesses the character encoding of a stream.
 *
 * <p>The stream is read in chunks of {@code Constants.ENCODING_HELPER_BUFFER_SIZE}
 * bytes until a read yields no bytes or the detector reports that it is done.
 * The stream is not closed by this method.
 *
 * @param is the stream to inspect
 * @return the detected charset name, or {@code Constants.DEFAULT_ENCODING}
 *         when none was detected
 * @throws IOException if reading from {@code is} fails
 */
public static String guessCharset(InputStream is) throws IOException{
    final byte[] chunk = new byte[Constants.ENCODING_HELPER_BUFFER_SIZE];
    final UniversalDetector sniffer = new UniversalDetector(null);

    for (int count = is.read(chunk); count > 0 && !sniffer.isDone(); count = is.read(chunk)) {
        sniffer.handleData(chunk, 0, count);
    }
    sniffer.dataEnd();

    final String detected = sniffer.getDetectedCharset();
    sniffer.reset();
    return detected == null ? Constants.DEFAULT_ENCODING : detected;
}
Example #10
Source File: Utils.java From Man-Man with GNU General Public License v3.0 | 5 votes |
/**
 * Detects the character encoding of the decompressed content of a gzip file.
 *
 * <p>The stream is closed in a {@code finally} block. If the gzip wrapper
 * cannot be created, the underlying file stream is closed instead, so no file
 * handle is leaked on failure.
 *
 * @param gzipped the gzip-compressed file to inspect
 * @return whatever {@code getDetectedCharset()} reports after the data has
 *         been fed in; this value is passed through unchanged
 *         (NOTE(review): presumably {@code null} when nothing is detected — confirm)
 * @throws IOException if the file cannot be opened, is not valid gzip, or cannot be read
 */
public static String detectEncodingOfArchive(File gzipped) throws IOException {
    java.io.InputStream in = new FileInputStream(gzipped);
    try {
        // From here on, closing `in` closes the gzip wrapper and the file beneath it.
        in = new GZIPInputStream(in);
        byte[] buf = new byte[4096];
        UniversalDetector detector = new UniversalDetector(null);
        int read;
        while ((read = in.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, read);
        }
        detector.dataEnd();
        return detector.getDetectedCharset();
    } finally {
        in.close();
    }
}
Example #11
Source File: CharsetDetector.java From WebCollector with GNU General Public License v3.0 | 5 votes |
/**
 * Guesses the likely character set of a byte array; falls back to UTF-8 when
 * detection fails.
 *
 * @param bytes the byte array to examine
 * @return the likely charset name, or {@code "UTF-8"} if detection fails
 */
public static String guessEncodingByMozilla(byte[] bytes) {
    final UniversalDetector sniffer = new UniversalDetector(null);
    sniffer.handleData(bytes, 0, bytes.length);
    sniffer.dataEnd();

    final String detected = sniffer.getDetectedCharset();
    sniffer.reset();
    return detected != null ? detected : "UTF-8";
}
Example #12
Source File: FileInfoReader.java From editorconfig-netbeans with MIT License | 5 votes |
/**
 * Guesses the charset of a file object's content.
 *
 * <p>When the detector reports no encoding, {@code ISO-8859-1} is used. When
 * the content cannot be read or the detected name is not a usable charset, the
 * exception is printed and UTF-8 is returned.
 *
 * @param fo the file object whose input stream is inspected
 * @return the guessed charset; UTF-8 if detection did not complete
 */
protected static Charset guessCharset(FileObject fo) {
    final byte[] chunk = new byte[4096];
    Charset guessed = StandardCharsets.UTF_8;

    try (InputStream in = fo.getInputStream()) {
        final UniversalDetector sniffer = new UniversalDetector(null);
        for (int count = in.read(chunk); count > 0 && !sniffer.isDone(); count = in.read(chunk)) {
            sniffer.handleData(chunk, 0, count);
        }
        sniffer.dataEnd();

        final String detected = sniffer.getDetectedCharset();
        sniffer.reset();
        guessed = Charset.forName(detected == null ? "ISO-8859-1" : detected);
    } catch (IllegalArgumentException | IOException ex) {
        Exceptions.printStackTrace(ex);
    }
    return guessed;
}
Example #13
Source File: MakePlaylistFS.java From freemp with Apache License 2.0 | 5 votes |
@Override public void getAllTracks(Context context, boolean refresh) { this.refresh = refresh; t = System.currentTimeMillis(); String scanDir = PreferenceManager.getDefaultSharedPreferences(context).getString("scanDir", Environment.getExternalStorageDirectory().getAbsolutePath().toString()); File currentDir = new File(scanDir); tempAllTracks = (ArrayList<ClsTrack>) FileUtils.readObject("alltracksfs", context); tempAllTracksMediaStore = (ArrayList<ClsTrack>) FileUtils.readObject("alltracksms", context); if (refresh || tempAllTracksMediaStore == null || tempAllTracksMediaStore.size() == 0) { fillMediaStoreTracks = new FillMediaStoreTracks(context); tempAllTracksMediaStore = fillMediaStoreTracks.getTracks(); } if (!refresh && tempAllTracks != null && tempAllTracks.size() > 0) { allTracks = new ArrayList<ClsTrack>(tempAllTracks); } else { if (BASS.BASS_Init(-1, 44100, 0)) { String nativePath = context.getApplicationInfo().nativeLibraryDir; String[] listPlugins = new File(nativePath).list(); for (String s : listPlugins) { int plug = BASS.BASS_PluginLoad(nativePath + "/" + s, 0); } } detector = new UniversalDetector(null); walk(context, currentDir); FileUtils.writeObject("alltracksfs", context, allTracks); AQUtility.debug("time", "(ms):" + (System.currentTimeMillis() - t)); //5000 //81000 //7000 } }
Example #14
Source File: ByteUtils.java From DouBiNovel with Apache License 2.0 | 5 votes |
/** * 获取文件编码类型 * * @param bytes 文件bytes数组 * @return 编码类型 */ public static String getEncoding(byte[] bytes) { String defaultEncoding = "UTF-8"; UniversalDetector detector = new UniversalDetector(null); detector.handleData(bytes, 0, bytes.length); detector.dataEnd(); String encoding = detector.getDetectedCharset(); detector.reset(); // log.info("字符编码是:{}", encoding); if (encoding == null) { encoding = defaultEncoding; } return encoding; }
Example #15
Source File: LocalDocReader.java From TranskribusCore with GNU General Public License v3.0 | 5 votes |
public static String readTextFromFile(File txtFile) throws IOException { byte[] buf = new byte[4096]; java.io.FileInputStream fis = new FileInputStream(txtFile); // (1) UniversalDetector detector = new UniversalDetector(null); // (2) int nread; while ((nread = fis.read(buf)) > 0 && !detector.isDone()) { detector.handleData(buf, 0, nread); } fis.close(); // (3) detector.dataEnd(); // (4) String encoding = detector.getDetectedCharset(); if (encoding != null) { logger.debug("Detected encoding = " + encoding); } else { logger.debug("No encoding detected - use utf-8"); encoding = "utf-8"; } // (5) detector.reset(); String text = FileUtils.readFileToString(txtFile, encoding); //String text = FileUtils.readFileToString(txtFile, "ISO-8859-1"); //logger.debug("text = "+text); return text; }
Example #16
Source File: LyricView.java From RetroMusicPlayer with GNU General Public License v3.0 | 5 votes |
/**
 * Sets the lyric file to display, detecting its character encoding first.
 *
 * <p>A null or missing file resets the view and clears the remembered path. A
 * file whose path equals the current one is ignored. Otherwise the view is
 * reset, the encoding is detected with a {@link UniversalDetector}, and the
 * work is handed to {@code setLyricFile(File, String)}, with {@code "UTF-8"}
 * as the fallback encoding.
 *
 * @param file the lyric file, may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    } else if (file.getPath().equals(mCurrentLyricFilePath)) {
        return;
    } else {
        mCurrentLyricFilePath = file.getPath();
        reset();
    }

    try {
        String encoding;
        FileInputStream fis = new FileInputStream(file);
        try {
            byte[] buf = new byte[1024];
            UniversalDetector detector = new UniversalDetector(null);
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
            detector.dataEnd();
            encoding = detector.getDetectedCharset();
            detector.reset();
        } finally {
            // Close the detection stream on every path, and before the file is handed on.
            fis.close();
        }
        setLyricFile(file, encoding != null ? encoding : "UTF-8");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Example #17
Source File: MakePlaylistFS.java From IdealMedia with Apache License 2.0 | 5 votes |
/**
 * Loads the track list from cache, from a directory rescan, or from the
 * media-store list.
 *
 * <p>When {@code refresh} is false and the cached file-system list is
 * non-empty, that list is used. When {@code refresh} is true, BASS is
 * initialised, every entry of the native library directory is offered to
 * {@code BASS_PluginLoad}, the configured directory is walked and the result
 * is cached. Otherwise the media-store list is used.
 *
 * @param context Android context used for preferences, cache files and the
 *                native library directory
 * @param refresh {@code true} to rescan instead of using cached data
 */
@Override
public void getAllTracks(Context context, boolean refresh) {
    this.refresh = refresh;
    t = System.currentTimeMillis();

    String scanDir = PreferenceManager.getDefaultSharedPreferences(context)
            .getString(context.getString(R.string.key_mediapath),
                    Environment.getExternalStorageDirectory().getAbsolutePath());
    File currentDir = new File(scanDir);

    tempAllTracks = (ArrayList<Track>) FileUtils.read("alltracksfs", context);
    tempAllTracksMediaStore = (ArrayList<Track>) FileUtils.read("alltracksms", context);

    if (refresh || tempAllTracksMediaStore == null || tempAllTracksMediaStore.size() == 0) {
        tempAllTracksMediaStore = new FillMediaStoreTracks(context).getTracks();
    }

    if (!refresh && tempAllTracks != null && tempAllTracks.size() > 0) {
        allTracks = new ArrayList<Track>(tempAllTracks);
    } else {
        if (refresh) {
            if (BASS.BASS_Init(-1, 44100, 0)) {
                String nativePath = context.getApplicationInfo().nativeLibraryDir;
                String[] listPlugins = new File(nativePath).list();
                // File.list() returns null when the path is not a readable directory.
                if (listPlugins != null) {
                    for (String s : listPlugins) {
                        BASS.BASS_PluginLoad(nativePath + "/" + s, 0);
                    }
                }
            }
            detector = new UniversalDetector(null);
            walk(currentDir);
            FileUtils.write("alltracksfs", context, allTracks);
        } else {
            allTracks = new ArrayList<Track>(tempAllTracksMediaStore);
        }
    }
}
Example #18
Source File: HtmlUtils.java From kbase-doc with Apache License 2.0 | 5 votes |
/**
 * Determines the encoding of a file by loading it fully into memory and
 * passing the bytes to a {@link UniversalDetector}.
 *
 * @author eko.zhan at Jul 3, 2017 1:54:50 PM
 * @param file the file to inspect
 * @return whatever {@code getDetectedCharset()} reports for the file's bytes,
 *         passed through unchanged
 * @throws IOException if the file cannot be read
 */
public static String getFileEncoding(File file) throws IOException{
    final UniversalDetector sniffer = new UniversalDetector(null);
    final byte[] content = FileUtils.readFileToByteArray(file);

    sniffer.handleData(content, 0, content.length);
    sniffer.dataEnd();

    final String detected = sniffer.getDetectedCharset();
    return detected;
}
Example #19
Source File: EncodingDetector.java From lizzie with GNU General Public License v3.0 | 5 votes |
/**
 * Reads a stream completely and decodes it with an automatically detected
 * character encoding.
 *
 * <p>The stream is always closed, including when reading fails. Decoding uses
 * the detected encoding, or {@code "UTF-8"} when none is detected. This is a
 * best-effort helper: on any {@link IOException} an empty string is returned.
 *
 * @param is the stream to read; closed before returning
 * @return the decoded content, or {@code ""} if the stream is empty or an I/O
 *         error occurs
 */
public static String toString(InputStream is) {
    String encoding = "UTF-8";
    try {
        byte[] buf = new byte[4096];
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        try {
            int nread;
            while ((nread = is.read(buf)) > 0) {
                output.write(buf, 0, nread);
            }
        } finally {
            // Close the stream even when a read fails part-way through.
            is.close();
        }

        if (output.size() > 0) {
            byte[] data = output.toByteArray();
            UniversalDetector detector = new UniversalDetector(null);
            detector.handleData(data, 0, data.length);
            detector.dataEnd();
            String detect = detector.getDetectedCharset();
            if (detect != null) {
                encoding = detect;
            }
            detector.reset();
            return new String(data, encoding);
        }
    } catch (IOException ignored) {
        // Deliberate best-effort: callers receive "" when the content cannot be read or decoded.
    }
    return "";
}
Example #20
Source File: LyricView.java From MusicPlayer_XiangDa with GNU General Public License v3.0 | 5 votes |
/**
 * Selects the lyric file to show, detecting its character encoding first.
 *
 * <p>A null or non-existent file resets the view and clears the remembered
 * path. A file whose path matches the current one is ignored. For a new file
 * the view is reset, the encoding is detected with a {@link UniversalDetector}
 * and {@code setLyricFile(File, String)} is called, using {@code "UTF-8"} when
 * no encoding is detected.
 *
 * @param file the lyric file, may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    } else if (file.getPath().equals(mCurrentLyricFilePath)) {
        return;
    } else {
        mCurrentLyricFilePath = file.getPath();
        reset();
    }

    try {
        String encoding;
        FileInputStream fis = new FileInputStream(file);
        try {
            byte[] buf = new byte[1024];
            UniversalDetector detector = new UniversalDetector(null);
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
            detector.dataEnd();
            encoding = detector.getDetectedCharset();
            detector.reset();
        } finally {
            // Always release the file handle, and do so before the file is read again.
            fis.close();
        }
        setLyricFile(file, encoding != null ? encoding : "UTF-8");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Example #21
Source File: Charset.java From dualsub with GNU General Public License v3.0 | 4 votes |
/**
 * Creates the instance together with the {@link UniversalDetector} it holds.
 * The detector is constructed with a {@code null} argument.
 */
public Charset() {
    this.detector = new UniversalDetector(null);
}
Example #22
Source File: Charset.java From dualsub with GNU General Public License v3.0 | 4 votes |
/**
 * Gives access to the detector held by this instance.
 *
 * @return the {@link UniversalDetector} stored in the {@code detector} field;
 *         the same object is returned on every call, not a copy
 */
public UniversalDetector getCharsetDetector() {
    return this.detector;
}
Example #23
Source File: Track.java From IdealMedia with Apache License 2.0 | 4 votes |
/**
 * Builds a {@link Track} from a URI by reading the file's tags through BASS.
 *
 * <p>For each entry of {@code FORMATS} the raw tag bytes are read, their
 * encoding is detected with a {@link UniversalDetector}, and they are decoded
 * into an {@code @}-separated string. If no format yields text, the tags are
 * read again with the UTF-8 tag expressions. Artist and title are taken from
 * fields 0 and 3; an empty title falls back to the last path segment of the URI.
 *
 * @param data URI of the audio file
 * @return the populated track, or {@code null} when the tags cannot be read or
 *         contain four or fewer fields (see the review note in the loop for
 *         one further {@code null} path)
 */
public static Track fromUri(Uri data) {
    Track t = new Track();
    String path = data.getPath();
    UniversalDetector detector = new UniversalDetector(null);
    int chan = BASS.BASS_StreamCreateFile(path, 0, 0, 0);

    String tags = null;
    for (int format = 0; format < FORMATS.length; format++) {
        final ByteBuffer byteBuffer = TAGS.TAGS_ReadExByte(chan, "%ARTI@%YEAR@%TRCK@%TITL@%ALBM@%COMP" + " ", FORMATS[format]);
        final int bufferSize = byteBuffer.capacity();
        if (bufferSize < 10) {
            continue;
        }

        final ByteBuffer frameBuf = ByteBuffer.allocate(bufferSize);
        frameBuf.put(byteBuffer);

        detector.handleData(frameBuf.array(), 0, bufferSize);
        detector.dataEnd();
        final String encoding = detector.getDetectedCharset();

        boolean wrongencoding = false;
        try {
            // A null or unknown charset name makes forName throw; treated as "wrong encoding".
            tags = new String(frameBuf.array(), 0, bufferSize, Charset.forName(encoding));
        } catch (Exception e) {
            wrongencoding = true;
        } finally {
            detector.reset();
        }
        if (wrongencoding) {
            continue;
        }

        if (!TextUtils.isEmpty(tags)) {
            // NOTE(review): returning null once four or more fields were decoded looks
            // like it may have been meant as `break` — behaviour kept as-is; confirm intent.
            if (tags.split("@").length >= 4) {
                return null;
            }
        }
    }

    if (TextUtils.isEmpty(tags)) {
        tags = TAGS.TAGS_Read(chan, "%UTF8(%ARTI)@%YEAR@%TRCK@%UTF8(%TITL)@%UTF8(%ALBM)@%UTF8(%COMP)" + " ");
    }
    if (TextUtils.isEmpty(tags)) {
        return null;
    }

    String[] tagsArray = tags.split("@");
    if (tagsArray.length <= 4) {
        return null;
    }

    int duration = (int) (0.5d + BASS.BASS_ChannelBytes2Seconds(chan, BASS.BASS_ChannelGetLength(chan, BASS.BASS_POS_BYTE)));
    t.artist = tagsArray[0];
    t.title = tagsArray[3];
    t.duration = duration;
    t.path = path;

    // Compare by content: `title == ""` only tested reference identity.
    if (TextUtils.isEmpty(t.title)) {
        t.setTitle(data.getLastPathSegment());
    }
    return t;
}