org.mozilla.universalchardet.UniversalDetector Java Examples

The following examples show how to use org.mozilla.universalchardet.UniversalDetector. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CrawlUtils.java    From Asqatasun with GNU Affero General Public License v3.0 7 votes vote down vote up
/**
 * Detects the character set of the content read from the given stream.
 *
 * <p>The stream is handed to a {@link UniversalDetector} in 4096-byte chunks until a
 * read returns no data or the detector reports that it is done. The stream is not
 * closed by this method.
 *
 * @param is the stream to inspect
 * @return the detected charset name when one was found and
 *         {@code CrawlUtils.isValidCharset} accepts it; {@code DEFAULT_CHARSET} otherwise
 * @throws java.io.IOException if reading from the stream fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    UniversalDetector charsetDetector = new UniversalDetector(null);
    byte[] chunk = new byte[4096];

    while (true) {
        int count = is.read(chunk);
        if (count <= 0 || charsetDetector.isDone()) {
            break;
        }
        charsetDetector.handleData(chunk, 0, count);
    }
    charsetDetector.dataEnd();

    String detected = charsetDetector.getDetectedCharset();
    if (detected == null) {
        LOGGER.debug("No encoding detected.");
    } else {
        LOGGER.debug("Detected encoding = " + detected);
    }
    charsetDetector.reset();

    // Fall back to the default when nothing usable was detected.
    if (detected == null || !CrawlUtils.isValidCharset(detected)) {
        return DEFAULT_CHARSET;
    }
    return detected;
}
 
Example #2
Source File: TaskIo.java    From jdotxt with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Guesses the character encoding of a file by feeding its bytes to a
 * {@link UniversalDetector}.
 *
 * @param file the file to inspect
 * @return the detected charset name, or {@code DEFAULT_ENCODING} when the detector
 *         reports nothing
 * @throws IOException if the file cannot be opened, read or closed
 */
private static String detectEncoding(File file) throws IOException {
    UniversalDetector detector = new UniversalDetector(null);
    byte[] buf = new byte[4096];

    // try-with-resources guarantees the stream is closed even when a read fails;
    // the previous code skipped the close call whenever read() threw.
    try (FileInputStream fis = new FileInputStream(file)) {
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
    }

    detector.dataEnd();
    String encoding = detector.getDetectedCharset();
    return encoding != null ? encoding : DEFAULT_ENCODING;
}
 
Example #3
Source File: CsvImporter.java    From fingen with Apache License 2.0 6 votes vote down vote up
/**
 * Guesses the charset of the file named by {@code mFileName}.
 *
 * <p>Detection is best-effort: any failure is printed and the default is returned.
 *
 * @return the detected charset name, or {@code "UTF-8"} when nothing was detected
 *         or an error occurred
 * @throws IOException declared by the signature; failures inside the body are
 *         caught and printed rather than propagated
 */
private String detectCharset() throws IOException {
    String result = "UTF-8";
    UniversalDetector detector = new UniversalDetector(null);
    byte[] buf = new byte[4096];

    try {
        // The stream is closed even if a read fails; previously it leaked in that case.
        try (FileInputStream fis = new FileInputStream(mFileName)) {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            result = encoding;
        }

        detector.reset();
    } catch (Exception e) {
        // Deliberately non-fatal: the caller still gets the UTF-8 default.
        e.printStackTrace();
    }
    return result;
}
 
Example #4
Source File: UniversalEncodingDetector.java    From onedev with MIT License 6 votes vote down vote up
/**
 * Heuristically decides whether a stream holds binary (non-text) content.
 *
 * <p>The stream position is restored via {@code mark}/{@code reset} before returning.
 * NOTE(review): this presumably requires a stream that supports mark/reset — confirm
 * against callers.
 *
 * @param in the stream to inspect
 * @return {@code false} when the detector is already done after the first (up to)
 *         four bytes or when no zero byte shows up in the inspected prefix;
 *         {@code true} as soon as a zero byte is read
 * @throws IOException if reading from or resetting the stream fails
 */
public static boolean isBinary(InputStream in) throws IOException {
    byte[] buf = new byte[4];
    in.mark(5);
    int len = in.read(buf);
    in.reset();

    // read() returns -1 at end of stream; never hand a non-positive length to the detector.
    if (len > 0) {
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(buf, 0, len);
        if (detector.isDone()) {
            return false;
        }
    }

    // The detector was not conclusive: scan the prefix for a zero byte instead.
    in.mark(LOOKAHEAD);
    len = 0;
    int b;
    while ((b = in.read()) != -1 && len < (LOOKAHEAD - 192)) {
        len++;
        if (b == 0) {
            in.reset();
            return true;
        }
    }
    in.reset();
    return false;
}
 
Example #5
Source File: BaseParser.java    From substitution-schedule-parser with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Initialises the parser state and builds the HTTP executor used for requests.
 *
 * @param scheduleData   schedule configuration; also passed to the colour provider and
 *                       to {@code getSslConnectionSocketFactory}
 * @param cookieProvider stored as-is for later use
 * @throws RuntimeException wrapping any {@code GeneralSecurityException},
 *         {@code JSONException} or {@code IOException} raised while building the client
 */
BaseParser(SubstitutionScheduleData scheduleData, CookieProvider cookieProvider) {
    this.scheduleData = scheduleData;
    this.cookieProvider = cookieProvider;
    this.cookieStore = new BasicCookieStore();
    this.colorProvider = new ColorProvider(scheduleData);
    this.encodingDetector = new UniversalDetector(null);
    this.debuggingDataHandler = new NoOpDebuggingDataHandler();
    this.sardine = null;

    try {
        SSLConnectionSocketFactory socketFactory = getSslConnectionSocketFactory(scheduleData);
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.STANDARD)
                .build();
        String userAgent =
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36";

        CloseableHttpClient client = HttpClients.custom()
                .setSSLSocketFactory(socketFactory)
                .setRedirectStrategy(new LaxRedirectStrategy())
                .setDefaultRequestConfig(requestConfig)
                .setUserAgent(userAgent)
                .build();

        // All requests share the cookie store created above.
        this.executor = Executor.newInstance(client).use(cookieStore);
    } catch (GeneralSecurityException | JSONException | IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example #6
Source File: Charset.java    From dualsub with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Detects the charset of the given stream using the detector obtained from
 * {@code Charset.getSingleton().getCharsetDetector()}.
 *
 * <p>The stream is always closed and the detector always reset, including when
 * reading fails. NOTE(review): the detector instance comes from a singleton, so
 * concurrent calls presumably share it — confirm callers are single-threaded.
 *
 * @param inputStream the stream to inspect; closed by this method
 * @return the detected charset name, or {@code UTF8} when none is detected
 * @throws IOException if reading from or closing the stream fails
 */
public static String detect(InputStream inputStream) throws IOException {
	UniversalDetector detector = Charset.getSingleton()
			.getCharsetDetector();
	String encoding;
	try (InputStream in = inputStream) {
		byte[] buf = new byte[4096];
		int nread;
		while ((nread = in.read(buf)) > 0 && !detector.isDone()) {
			detector.handleData(buf, 0, nread);
		}
		detector.dataEnd();
		encoding = detector.getDetectedCharset();
	} finally {
		// Without this, a failed read left stale state in the detector for the next call.
		detector.reset();
	}
	if (encoding == null) {
		// If no encoding is detected, we assume UTF-8
		encoding = UTF8;
	}
	return encoding;
}
 
Example #7
Source File: CrawlerPack.java    From CrawlerPack with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the real content encoding by examining successive windows of
 * {@code detectBuffer} bytes, starting at {@code offset}, until one window yields
 * a charset.
 *
 * <p>Implemented as a loop rather than recursion so that large inputs with no
 * detectable charset cannot exhaust the call stack.
 *
 * @param content the bytes to examine
 * @param offset  index of the first byte to examine
 * @return the detected charset name, or {@code null} when no window yields one
 */
private String detectCharset(byte[] content, Integer offset){
    final int step = detectBuffer;
    int position = offset;

    while (true) {
        log.debug("detectCharset: offset=" + position);

        // detect failed: nothing left to examine
        if( position >= content.length ) return null;

        int remaining = content.length - position;
        UniversalDetector detector = new UniversalDetector(null);
        detector.handleData(content, position, Math.min(step, remaining));
        detector.dataEnd();

        String detectEncoding = detector.getDetectedCharset();
        if( null != detectEncoding ) return detectEncoding;

        // Stop when this window reached the end of the data, or when a non-positive
        // step would keep re-examining the same position forever.
        if( step <= 0 || remaining <= step ) return null;
        position += step;
    }
}
 
Example #8
Source File: CharsetDetector.java    From webarchive-commons with Apache License 2.0 5 votes vote down vote up
/**
 * Attempts to figure out the character set of a byte buffer using
 * the juniversalchardet library.
 *
 * @param buffer bytes to examine
 * @param len number of bytes of {@code buffer}, counted from index 0, handed to the detector
 * @return the result of {@code mapCharset} for the detected charset name, or
 *         {@code null} when {@code isCharsetSupported} rejects that name
 * @throws IOException declared by the signature
 */
protected String getCharsetFromBytes(byte buffer[], int len)
throws IOException {
    UniversalDetector sniffer = new UniversalDetector(null);
    sniffer.handleData(buffer, 0, len);
    sniffer.dataEnd();

    String detected = sniffer.getDetectedCharset();
    sniffer.reset();

    if (!isCharsetSupported(detected)) {
        return null;
    }
    return mapCharset(detected);
}
 
Example #9
Source File: FileStorable.java    From Readily with MIT License 5 votes vote down vote up
/**
 * Guesses the charset of the data read from a stream.
 *
 * <p>The stream is read in chunks of {@code Constants.ENCODING_HELPER_BUFFER_SIZE}
 * bytes until a read returns no data or the detector is done. The stream is not
 * closed here.
 *
 * @param is the stream to inspect
 * @return the detected charset name, or {@code Constants.DEFAULT_ENCODING} when
 *         nothing is detected
 * @throws IOException if reading from the stream fails
 */
public static String guessCharset(InputStream is) throws IOException{
	byte[] chunk = new byte[Constants.ENCODING_HELPER_BUFFER_SIZE];
	UniversalDetector sniffer = new UniversalDetector(null);

	while (true) {
		int count = is.read(chunk);
		if (count <= 0 || sniffer.isDone()) {
			break;
		}
		sniffer.handleData(chunk, 0, count);
	}
	sniffer.dataEnd();

	String guessed = sniffer.getDetectedCharset();
	sniffer.reset();
	return guessed == null ? Constants.DEFAULT_ENCODING : guessed;
}
 
Example #10
Source File: Utils.java    From Man-Man with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Detects the charset of the decompressed content of a gzip file.
 *
 * @param gzipped the gzip-compressed file to inspect
 * @return whatever the detector reports for the decompressed bytes (the result of
 *         {@code getDetectedCharset()} is returned unchanged)
 * @throws IOException if the file cannot be opened, is not valid gzip data, or
 *         cannot be read
 */
public static String detectEncodingOfArchive(File gzipped) throws IOException {
    UniversalDetector detector = new UniversalDetector(null);
    byte[] buf = new byte[4096];

    // Both streams are closed on every path. Previously the file stream leaked when
    // the GZIP header was invalid, and both leaked when a read failed.
    try (FileInputStream fis = new FileInputStream(gzipped);
         GZIPInputStream gis = new GZIPInputStream(fis)) {
        int read;
        while ((read = gis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, read);
        }
    }
    detector.dataEnd();

    return detector.getDetectedCharset();
}
 
Example #11
Source File: CharsetDetector.java    From WebCollector with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Guesses the likely charset of a byte array; falls back to UTF-8 when
 * detection fails.
 *
 * @param bytes the byte array to examine
 * @return the likely charset name, or {@code "UTF-8"} when detection fails
 */
public static String guessEncodingByMozilla(byte[] bytes) {
    UniversalDetector sniffer = new UniversalDetector(null);
    sniffer.handleData(bytes, 0, bytes.length);
    sniffer.dataEnd();

    String guessed = sniffer.getDetectedCharset();
    sniffer.reset();

    return guessed != null ? guessed : "UTF-8";
}
 
Example #12
Source File: FileInfoReader.java    From editorconfig-netbeans with MIT License 5 votes vote down vote up
/**
 * Guesses the charset of a file object's content.
 *
 * <p>When the detector reports nothing, {@code ISO-8859-1} is used. When reading
 * fails or the detected name cannot be turned into a {@link Charset}, the error is
 * reported through {@code Exceptions.printStackTrace} and the value assigned so far
 * (initially UTF-8) is returned.
 *
 * @param fo the file object to inspect
 * @return the guessed charset
 */
protected static Charset guessCharset(FileObject fo) {
  Charset guessed = StandardCharsets.UTF_8;

  try (InputStream stream = fo.getInputStream()) {
    UniversalDetector sniffer = new UniversalDetector(null);
    byte[] chunk = new byte[4096];

    while (true) {
      int count = stream.read(chunk);
      if (count <= 0 || sniffer.isDone()) {
        break;
      }
      sniffer.handleData(chunk, 0, count);
    }
    sniffer.dataEnd();

    String name = sniffer.getDetectedCharset();
    sniffer.reset();

    guessed = Charset.forName(name != null ? name : "ISO-8859-1");
  } catch (IllegalArgumentException | IOException ex) {
    Exceptions.printStackTrace(ex);
  }

  return guessed;
}
 
Example #13
Source File: MakePlaylistFS.java    From freemp with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the track list, either from the cached objects or by scanning the
 * configured directory.
 *
 * <p>Cached lists are read via {@code FileUtils.readObject}. The media-store list is
 * rebuilt when {@code refresh} is set or the cache is empty. The file-system list is
 * reused from cache unless {@code refresh} is set or it is empty; otherwise the scan
 * directory is walked and the result written back to the cache.
 *
 * @param context used for preferences, cached objects and the native library path
 * @param refresh {@code true} to ignore cached lists and rescan
 */
@Override
public void getAllTracks(Context context, boolean refresh) {
    this.refresh = refresh;
    t = System.currentTimeMillis();
    String scanDir = PreferenceManager.getDefaultSharedPreferences(context).getString("scanDir", Environment.getExternalStorageDirectory().getAbsolutePath());
    File currentDir = new File(scanDir);

    tempAllTracks = (ArrayList<ClsTrack>) FileUtils.readObject("alltracksfs", context);
    tempAllTracksMediaStore = (ArrayList<ClsTrack>) FileUtils.readObject("alltracksms", context);

    if (refresh || tempAllTracksMediaStore == null || tempAllTracksMediaStore.size() == 0) {
        fillMediaStoreTracks = new FillMediaStoreTracks(context);
        tempAllTracksMediaStore = fillMediaStoreTracks.getTracks();
    }

    if (!refresh && tempAllTracks != null && tempAllTracks.size() > 0) {
        allTracks = new ArrayList<ClsTrack>(tempAllTracks);
    } else {
        if (BASS.BASS_Init(-1, 44100, 0)) {
            String nativePath = context.getApplicationInfo().nativeLibraryDir;
            // File.list() returns null when the path is not a readable directory.
            String[] listPlugins = new File(nativePath).list();
            if (listPlugins != null) {
                for (String s : listPlugins) {
                    BASS.BASS_PluginLoad(nativePath + "/" + s, 0);
                }
            }
        }
        detector = new UniversalDetector(null);

        walk(context, currentDir);

        FileUtils.writeObject("alltracksfs", context, allTracks);

        AQUtility.debug("time", "(ms):" + (System.currentTimeMillis() - t)); //5000 //81000  //7000
    }
}
 
Example #14
Source File: ByteUtils.java    From DouBiNovel with Apache License 2.0 5 votes vote down vote up
/**
     * Determines the character encoding of a file's content.
     *
     * @param bytes the file content as a byte array
     * @return      the detected encoding name, or {@code "UTF-8"} when none is detected
     */
    public static String getEncoding(byte[] bytes) {
        UniversalDetector sniffer = new UniversalDetector(null);
        sniffer.handleData(bytes, 0, bytes.length);
        sniffer.dataEnd();

        String detected = sniffer.getDetectedCharset();
        sniffer.reset();

        // Nothing detected: use the default.
        return detected == null ? "UTF-8" : detected;
    }
 
Example #15
Source File: LocalDocReader.java    From TranskribusCore with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reads a text file into a string, decoding it with an auto-detected charset.
 *
 * <p>The file is read once for detection and then again by
 * {@code FileUtils.readFileToString} with the chosen encoding.
 *
 * @param txtFile the text file to read
 * @return the file content decoded with the detected encoding, or with
 *         {@code "utf-8"} when no encoding is detected
 * @throws IOException if the file cannot be opened or read
 */
public static String readTextFromFile(File txtFile) throws IOException {
	UniversalDetector detector = new UniversalDetector(null);
	byte[] buf = new byte[4096];

	// Feed the detector; the stream is closed even if a read fails
	// (previously it leaked in that case).
	try (FileInputStream fis = new FileInputStream(txtFile)) {
		int nread;
		while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
			detector.handleData(buf, 0, nread);
		}
	}
	detector.dataEnd();

	String encoding = detector.getDetectedCharset();
	if (encoding != null) {
		logger.debug("Detected encoding = " + encoding);
	} else {
		logger.debug("No encoding detected - use utf-8");
		encoding = "utf-8";
	}
	detector.reset();

	return FileUtils.readFileToString(txtFile, encoding);
}
 
Example #16
Source File: LyricView.java    From RetroMusicPlayer with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Sets the lyric file to display, detecting its character encoding first.
 *
 * <p>A {@code null} or missing file resets the view and clears the remembered
 * path. A file whose path equals the remembered one is ignored. Otherwise the view
 * is reset, the encoding is detected, and loading is delegated to
 * {@code setLyricFile(File, String)} with {@code "UTF-8"} as fallback encoding.
 * I/O failures are printed and otherwise ignored.
 *
 * @param file the lyric file, may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    } else if (file.getPath().equals(mCurrentLyricFilePath)) {
        return;
    } else {
        mCurrentLyricFilePath = file.getPath();
        reset();
    }

    try {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] buf = new byte[1024];

        // Close the detection stream on every path, and before the file is loaded;
        // previously it leaked when reading or loading threw.
        try (FileInputStream fis = new FileInputStream(file)) {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        }

        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();

        if (encoding != null) {
            setLyricFile(file, encoding);
        } else {
            setLyricFile(file, "UTF-8");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
 
Example #17
Source File: MakePlaylistFS.java    From IdealMedia with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the track list from the cached objects, from a fresh directory scan, or
 * from the media-store list.
 *
 * <p>Without {@code refresh}, a non-empty cached file-system list is used as-is; if
 * that cache is empty the media-store list is used instead. With {@code refresh},
 * the configured media path is walked and the result written back to the cache.
 *
 * @param context used for preferences, cached objects and the native library path
 * @param refresh {@code true} to ignore cached lists and rescan
 */
@Override
public void getAllTracks(Context context, boolean refresh) {
    this.refresh = refresh;
    t = System.currentTimeMillis();
    String scanDir = PreferenceManager.getDefaultSharedPreferences(context).getString(context.getString(R.string.key_mediapath), Environment.getExternalStorageDirectory().getAbsolutePath());
    File currentDir = new File(scanDir);

    tempAllTracks = (ArrayList<Track>) FileUtils.read("alltracksfs", context);
    tempAllTracksMediaStore = (ArrayList<Track>) FileUtils.read("alltracksms", context);

    if (refresh || tempAllTracksMediaStore == null || tempAllTracksMediaStore.size()==0) {
        tempAllTracksMediaStore =  new FillMediaStoreTracks(context).getTracks();
    }

    if (!refresh && tempAllTracks != null && tempAllTracks.size() > 0) {
        allTracks = new ArrayList<Track>(tempAllTracks);
    }
    else {
        if (refresh) {
            if (BASS.BASS_Init(-1, 44100, 0)) {
                String nativePath = context.getApplicationInfo().nativeLibraryDir;
                // File.list() returns null when the path is not a readable directory.
                String[] listPlugins = new File(nativePath).list();
                if (listPlugins != null) {
                    for (String s : listPlugins) {
                        BASS.BASS_PluginLoad(nativePath + "/" + s, 0);
                    }
                }
            }
            detector = new UniversalDetector(null);

            walk(currentDir);

            FileUtils.write("alltracksfs", context, allTracks);
        } else {
            allTracks = new ArrayList<Track>(tempAllTracksMediaStore);
        }
    }
}
 
Example #18
Source File: HtmlUtils.java    From kbase-doc with Apache License 2.0 5 votes vote down vote up
/**
 * Determines the character encoding of a file.
 * @author eko.zhan at Jul 3, 2017 1:54:50 PM
 * @param file the file to inspect; its whole content is read into memory
 * @return the value reported by the detector's {@code getDetectedCharset()}, unchanged
 * @throws IOException if the file cannot be read
 */
public static String getFileEncoding(File file) throws IOException{
	byte[] content = FileUtils.readFileToByteArray(file);

	UniversalDetector sniffer = new UniversalDetector(null);
	sniffer.handleData(content, 0, content.length);
	sniffer.dataEnd();

	String detected = sniffer.getDetectedCharset();
	return detected;
}
 
Example #19
Source File: EncodingDetector.java    From lizzie with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reads a stream completely and decodes it with an auto-detected charset.
 *
 * <p>This is a best-effort conversion: any I/O failure, or a detected charset name
 * the runtime cannot decode, yields an empty string instead of an exception.
 *
 * @param is the stream to read; closed by this method
 * @return the decoded content (UTF-8 is used when nothing is detected), or
 *         {@code ""} when the stream is empty or reading/decoding fails
 */
public static String toString(InputStream is) {
  byte[] data;
  // The stream is closed on every path; previously it leaked when read() threw.
  try (InputStream in = is) {
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    byte[] buf = new byte[4096];
    int nread;
    while ((nread = in.read(buf)) > 0) {
      output.write(buf, 0, nread);
    }
    data = output.toByteArray();
  } catch (IOException ignored) {
    // Intentionally swallowed: callers receive "" for unreadable input.
    return "";
  }

  if (data.length == 0) {
    return "";
  }

  String encoding = "UTF-8";
  UniversalDetector detector = new UniversalDetector(null);
  detector.handleData(data, 0, data.length);
  detector.dataEnd();

  String detect = detector.getDetectedCharset();
  if (detect != null) {
    encoding = detect;
  }
  detector.reset();

  try {
    return new String(data, encoding);
  } catch (java.io.UnsupportedEncodingException ignored) {
    // The detector named a charset this runtime does not know; keep the "" contract.
    return "";
  }
}
 
Example #20
Source File: LyricView.java    From MusicPlayer_XiangDa with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Sets the lyric file to display, detecting its character encoding first.
 *
 * <p>A {@code null} or missing file resets the view and clears the remembered
 * path. A file whose path equals the remembered one is ignored. Otherwise the view
 * is reset, the encoding is detected, and loading is delegated to
 * {@code setLyricFile(File, String)} with {@code "UTF-8"} as fallback encoding.
 * I/O failures are printed and otherwise ignored.
 *
 * @param file the lyric file, may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    } else if (file.getPath().equals(mCurrentLyricFilePath)) {
        return;
    } else {
        mCurrentLyricFilePath = file.getPath();
        reset();
    }

    try {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] buf = new byte[1024];

        // try-with-resources closes the stream even when a read fails, and releases
        // the file handle before the lyric file is opened again for loading.
        try (FileInputStream fis = new FileInputStream(file)) {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        }

        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();

        if (encoding != null) {
            setLyricFile(file, encoding);
        } else {
            setLyricFile(file, "UTF-8");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
 
Example #21
Source File: Charset.java    From dualsub with GNU General Public License v3.0 4 votes vote down vote up
/** Creates an instance holding a newly constructed {@link UniversalDetector}. */
public Charset() {
	this.detector = new UniversalDetector(null);
}
 
Example #22
Source File: Charset.java    From dualsub with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Returns the detector held by this instance.
 *
 * @return the {@link UniversalDetector} stored in the {@code detector} field
 */
public UniversalDetector getCharsetDetector() {
	return this.detector;
}
 
Example #23
Source File: Track.java    From IdealMedia with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code Track} from the tags of the media file behind a URI.
 *
 * <p>For each entry of {@code FORMATS} the raw tag bytes are read, their charset is
 * detected and they are decoded. The first decoding that yields at least four
 * {@code @}-separated fields is used. If no tags were decoded at all, the tags are
 * read again through {@code TAGS.TAGS_Read} with the {@code %UTF8(...)} format string.
 *
 * <p>NOTE(review): the handle returned by {@code BASS_StreamCreateFile} is never
 * released in this method — confirm whether it has to be freed.
 *
 * @param data URI of the media file; its path is handed to BASS
 * @return the populated track, or {@code null} when no tags could be read or the
 *         tag string has four fields or fewer
 */
public static Track fromUri(Uri data) {
    Track t = new Track();

    String path = data.getPath();

    UniversalDetector detector = new UniversalDetector(null);
    int chan = BASS.BASS_StreamCreateFile(path, 0, 0, 0);

    String tags = null;
    for (int format = 0; format < FORMATS.length; format++) {
        final ByteBuffer byteBuffer = TAGS.TAGS_ReadExByte(chan, "%ARTI@%YEAR@%TRCK@%TITL@%ALBM@%COMP" + " ", FORMATS[format]);

        final int bufferSize = byteBuffer.capacity();
        if (bufferSize < 10)
            continue;

        final ByteBuffer frameBuf = ByteBuffer.allocate(bufferSize);
        frameBuf.put(byteBuffer);

        detector.handleData(frameBuf.array(), 0, bufferSize);
        detector.dataEnd();
        final String encoding = detector.getDetectedCharset();
        // Reset right away so the next format starts from a clean detector.
        detector.reset();

        // No charset detected for this format: try the next one.
        if (encoding == null)
            continue;
        try {
            tags = new String(frameBuf.array(), 0, bufferSize, Charset.forName(encoding));
        } catch (IllegalArgumentException e) {
            // The detector named a charset this runtime does not support.
            continue;
        }

        // Stop at the first format that yields enough fields. This used to
        // "return null", which threw away every successfully decoded tag set.
        if (!TextUtils.isEmpty(tags) && tags.split("@").length >= 4)
            break;
    }

    if (TextUtils.isEmpty(tags))
        tags = TAGS.TAGS_Read(chan, "%UTF8(%ARTI)@%YEAR@%TRCK@%UTF8(%TITL)@%UTF8(%ALBM)@%UTF8(%COMP)" + " ");

    if (TextUtils.isEmpty(tags))
        return null;

    String[] tagsArray = tags.split("@");
    if (tagsArray.length <= 4)
        return null;

    int duration = (int) (0.5d+BASS.BASS_ChannelBytes2Seconds(chan, BASS.BASS_ChannelGetLength(chan, BASS.BASS_POS_BYTE)));

    t.artist = tagsArray[0];
    t.title = tagsArray[3];
    t.duration = duration;
    t.path = path;

    // `title == ""` compared references and never matched a computed empty string.
    if (TextUtils.isEmpty(t.title))
        t.setTitle(data.getLastPathSegment());

    return t;
}