Java Code Examples for org.mozilla.universalchardet.UniversalDetector#isDone()

The following examples show how to use org.mozilla.universalchardet.UniversalDetector#isDone(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: CrawlUtils.java    From Asqatasun with GNU Affero General Public License v3.0 7 votes vote down vote up
/**
 * Guesses the character encoding of the bytes available from a stream.
 *
 * <p>The stream is fed to a {@link UniversalDetector} in 4096-byte chunks
 * until the stream is exhausted or the detector reports it is done. The
 * outcome is logged at debug level. The stream is not closed here; that
 * remains the caller's job.
 *
 * @param is the stream to sample; it is consumed (at least partially)
 * @return the detected charset name when one was detected and
 *         {@code CrawlUtils.isValidCharset} accepts it, otherwise
 *         {@code DEFAULT_CHARSET}
 * @throws java.io.IOException if reading from {@code is} fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    final byte[] chunk = new byte[4096];
    final UniversalDetector detector = new UniversalDetector(null);

    // Same read-then-check order as the classic while loop: a chunk is read
    // first, and only handed to the detector if it is not already done.
    for (int n = is.read(chunk); n > 0 && !detector.isDone(); n = is.read(chunk)) {
        detector.handleData(chunk, 0, n);
    }
    detector.dataEnd();

    final String detected = detector.getDetectedCharset();
    if (detected == null) {
        LOGGER.debug("No encoding detected.");
    } else {
        LOGGER.debug("Detected encoding = " + detected);
    }
    detector.reset();

    // Guard clause: anything missing or rejected falls back to the default.
    if (detected == null || !CrawlUtils.isValidCharset(detected)) {
        return DEFAULT_CHARSET;
    }
    return detected;
}
 
Example 2
Source File: CsvImporter.java    From fingen with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the character encoding of the file referenced by {@code mFileName}.
 *
 * <p>The file is fed to a {@link UniversalDetector} in 4096-byte chunks until
 * the file ends or the detector reports it is done. Detection is best-effort:
 * any exception is printed and the default is returned instead.
 *
 * @return the detected charset name, or {@code "UTF-8"} when nothing was
 *         detected or an error occurred
 * @throws IOException declared by the signature; failures raised inside the
 *         detection block are caught and printed rather than propagated
 */
private String detectCharset() throws IOException {
    String result = "UTF-8";
    UniversalDetector detector = new UniversalDetector(null);
    byte[] buf = new byte[4096];

    // try-with-resources guarantees the file handle is released even when
    // read() fails part-way through; previously it leaked on that path.
    try (FileInputStream fis = new FileInputStream(mFileName)) {
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            result = encoding;
        }

        detector.reset();
    } catch (Exception e) {
        // Deliberate best-effort: report the problem and keep the default.
        e.printStackTrace();
    }
    return result;
}
 
Example 3
Source File: UniversalEncodingDetector.java    From onedev with MIT License 6 votes vote down vote up
/**
 * Heuristically decides whether a stream holds binary content.
 *
 * <p>First, up to four leading bytes are peeked (via mark/reset) and handed
 * to a {@link UniversalDetector}; if the detector is already done after
 * that, the content is treated as text. Otherwise the stream is scanned for
 * a zero byte, which marks it as binary. The stream is reset before
 * returning, so the caller sees it at its original position.
 *
 * <p>NOTE(review): the method calls {@code mark}/{@code reset}, so it
 * presumably expects a stream that supports them - confirm against callers.
 *
 * @param in the stream to inspect
 * @return {@code true} if a zero byte is found within the scanned prefix,
 *         {@code false} otherwise
 * @throws IOException if reading or resetting the stream fails
 */
public static boolean isBinary(InputStream in) throws IOException {
    // Peek at the head of the stream without consuming it.
    final byte[] head = new byte[4];
    in.mark(5);
    final int headLen = in.read(head);
    in.reset();

    final UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(head, 0, headLen);

    boolean foundNul = false;
    if (!detector.isDone()) {
        // Detector undecided: fall back to looking for a NUL byte.
        in.mark(LOOKAHEAD);
        final int limit = LOOKAHEAD - 192;
        int next;
        for (int scanned = 0; (next = in.read()) != -1 && scanned < limit; scanned++) {
            if (next == 0) {
                foundNul = true;
                break;
            }
        }
        in.reset();
    }
    return foundNul;
}
 
Example 4
Source File: Charset.java    From dualsub with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Detects the character encoding of a stream and closes the stream.
 *
 * <p>The detector is obtained from {@code Charset.getSingleton()}, i.e. it
 * is shared rather than created per call. It is therefore reset in a
 * {@code finally} block so that a failed read cannot leave stale state
 * behind for the next caller.
 *
 * @param inputStream the stream to sample; it is always closed before this
 *        method returns or throws
 * @return the detected charset name, or {@code UTF8} when none was detected
 * @throws IOException if reading from or closing the stream fails
 */
public static String detect(InputStream inputStream) throws IOException {
	UniversalDetector detector = Charset.getSingleton()
			.getCharsetDetector();
	byte[] buf = new byte[4096];
	String encoding;
	// try-with-resources closes the caller's stream on every exit path;
	// previously it stayed open when read() threw.
	try (InputStream in = inputStream) {
		int nread;
		while ((nread = in.read(buf)) > 0 && !detector.isDone()) {
			detector.handleData(buf, 0, nread);
		}
		detector.dataEnd();
		encoding = detector.getDetectedCharset();
	} finally {
		// Always return the shared detector to a clean state.
		detector.reset();
	}
	if (encoding == null) {
		// If no encoding is detected, we assume UTF-8
		encoding = UTF8;
	}
	return encoding;
}
 
Example 5
Source File: TaskIo.java    From jdotxt with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Detects the character encoding of a file.
 *
 * <p>The file is fed to a {@link UniversalDetector} in 4096-byte chunks until
 * the file ends or the detector reports it is done.
 *
 * @param file the file to inspect
 * @return the detected charset name, or {@code DEFAULT_ENCODING} when none
 *         was detected
 * @throws IOException if the file cannot be opened or read
 */
private static String detectEncoding(File file) throws IOException {
	byte[] buf = new byte[4096];
	UniversalDetector detector = new UniversalDetector(null);

	FileInputStream fis = new FileInputStream(file);
	try {
		int nread;
		while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
			detector.handleData(buf, 0, nread);
		}
	} finally {
		// Release the file handle even when read() throws; previously the
		// stream was only closed on the success path.
		Util.closeStream(fis);
	}

	detector.dataEnd();
	String encoding = detector.getDetectedCharset();

	if (encoding == null) {
		encoding = DEFAULT_ENCODING;
	}
	return encoding;
}
 
Example 6
Source File: LyricView.java    From MusicPlayer_XiangDa with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Switches the view to the given lyric file, detecting its encoding first.
 *
 * <p>A {@code null} or non-existent file resets the view and clears the
 * remembered path. A file whose path equals the remembered one is ignored.
 * Otherwise the path is remembered, the view is reset, and the file is fed
 * to a {@link UniversalDetector} in 1024-byte chunks; the result is passed
 * to {@code setLyricFile(File, String)}, with {@code "UTF-8"} used when
 * nothing was detected. I/O failures are printed, not propagated.
 *
 * @param file the lyric file to load; may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    }
    if (file.getPath().equals(mCurrentLyricFilePath)) {
        // Same file as the one already loaded: nothing to do.
        return;
    }
    mCurrentLyricFilePath = file.getPath();
    reset();

    // try-with-resources releases the file handle on every path; previously
    // the stream leaked whenever an IOException interrupted the block.
    try (FileInputStream fis = new FileInputStream(file)) {
        byte[] buf = new byte[1024];
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }

        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            setLyricFile(file, encoding);
        } else {
            setLyricFile(file, "UTF-8");
        }
        detector.reset();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
 
Example 7
Source File: LyricView.java    From RetroMusicPlayer with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Loads a lyric file into this view after guessing its character encoding.
 *
 * <p>If {@code file} is {@code null} or does not exist, the view is reset
 * and the stored path is cleared. If its path matches the stored path the
 * call is a no-op. Otherwise the path is stored, the view is reset, and the
 * file content is sampled by a {@link UniversalDetector} in 1024-byte
 * chunks. The detected name (or {@code "UTF-8"} if none) is forwarded to
 * {@code setLyricFile(File, String)}. I/O errors are printed and swallowed.
 *
 * @param file the lyric file to load; may be {@code null}
 */
public void setLyricFile(File file) {
    if (file == null || !file.exists()) {
        reset();
        mCurrentLyricFilePath = "";
        return;
    }
    if (file.getPath().equals(mCurrentLyricFilePath)) {
        // Already showing this file.
        return;
    }
    mCurrentLyricFilePath = file.getPath();
    reset();

    // The stream is declared as a resource so it is closed even when
    // reading fails; the earlier version only closed it on success.
    try (FileInputStream fis = new FileInputStream(file)) {
        byte[] buf = new byte[1024];
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }

        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            setLyricFile(file, encoding);
        } else {
            setLyricFile(file, "UTF-8");
        }
        detector.reset();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
 
Example 8
Source File: LocalDocReader.java    From TranskribusCore with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reads a text file into a string, using a detected character encoding.
 *
 * <p>The file is first sampled by a {@link UniversalDetector} in 4096-byte
 * chunks until the file ends or the detector reports it is done. The file
 * is then read in full via {@code FileUtils.readFileToString} with the
 * detected encoding, or {@code "utf-8"} when none was detected.
 *
 * @param txtFile the file to read
 * @return the file content decoded with the chosen encoding
 * @throws IOException if the file cannot be opened or read
 */
public static String readTextFromFile(File txtFile) throws IOException {
	byte[] buf = new byte[4096];
	UniversalDetector detector = new UniversalDetector(null);

	// Feed the detector; try-with-resources closes the stream even when
	// read() throws, which the earlier explicit close() did not cover.
	try (FileInputStream fis = new FileInputStream(txtFile)) {
		int nread;
		while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
			detector.handleData(buf, 0, nread);
		}
	}
	// Tell the detector that no more data will follow.
	detector.dataEnd();

	String encoding = detector.getDetectedCharset();
	if (encoding != null) {
		logger.debug("Detected encoding = " + encoding);
	} else {
		logger.debug("No encoding detected - use utf-8");
		encoding = "utf-8";
	}

	detector.reset();

	return FileUtils.readFileToString(txtFile, encoding);
}
 
Example 9
Source File: FileInfoReader.java    From editorconfig-netbeans with MIT License 5 votes vote down vote up
/**
 * Guesses the charset of a file object's content.
 *
 * <p>The content is fed to a {@link UniversalDetector} in 4096-byte chunks
 * until the stream ends or the detector reports it is done. When the
 * detector yields no name, {@code "ISO-8859-1"} is used. If reading fails or
 * the name cannot be resolved by {@link Charset#forName(String)}, the
 * exception is reported through {@code Exceptions.printStackTrace} and the
 * initial value, UTF-8, is returned.
 *
 * @param fo the file object whose input stream is sampled
 * @return the resolved charset, or UTF-8 if no charset was resolved before
 *         an error occurred
 */
protected static Charset guessCharset(FileObject fo) {
  final byte[] chunk = new byte[4096];
  Charset result = StandardCharsets.UTF_8;

  try (InputStream is = fo.getInputStream()) {
    final UniversalDetector detector = new UniversalDetector(null);

    for (int n = is.read(chunk); n > 0 && !detector.isDone(); n = is.read(chunk)) {
      detector.handleData(chunk, 0, n);
    }
    detector.dataEnd();

    final String detected = detector.getDetectedCharset();
    final String name = (detected != null) ? detected : "ISO-8859-1";
    detector.reset();

    // Assigned (not returned) inside the try so that a failure while
    // closing the stream still yields the resolved charset.
    result = Charset.forName(name);
  } catch (IllegalArgumentException | IOException ex) {
    Exceptions.printStackTrace(ex);
  }

  return result;
}
 
Example 10
Source File: Utils.java    From Man-Man with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Detects the character encoding of the content inside a gzip file.
 *
 * <p>The decompressed bytes are fed to a {@link UniversalDetector} in
 * 4096-byte chunks until the data ends or the detector reports it is done.
 *
 * @param gzipped the gzip-compressed file to inspect
 * @return whatever {@code getDetectedCharset()} yields for the decompressed
 *         data; no fallback is applied here, so callers should be prepared
 *         for {@code null} when nothing was detected
 * @throws IOException if the file cannot be opened, is not valid gzip data,
 *         or cannot be read
 */
public static String detectEncodingOfArchive(File gzipped) throws IOException {
    byte[] buf = new byte[4096];
    UniversalDetector detector = new UniversalDetector(null);

    // Both streams are resources: the file handle is released even when the
    // GZIPInputStream constructor rejects the header or a read fails, which
    // previously leaked the underlying FileInputStream.
    try (FileInputStream fis = new FileInputStream(gzipped);
         GZIPInputStream gis = new GZIPInputStream(fis)) {
        int read;
        while ((read = gis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, read);
        }
    }
    detector.dataEnd();

    return detector.getDetectedCharset();
}
 
Example 11
Source File: FileStorable.java    From Readily with MIT License 5 votes vote down vote up
/**
 * Guesses the character encoding of the bytes available from a stream.
 *
 * <p>The stream is fed to a {@link UniversalDetector} in chunks of
 * {@code Constants.ENCODING_HELPER_BUFFER_SIZE} bytes until it is exhausted
 * or the detector reports it is done. The stream is not closed here.
 *
 * @param is the stream to sample; it is consumed (at least partially)
 * @return the detected charset name, or {@code Constants.DEFAULT_ENCODING}
 *         when none was detected
 * @throws IOException if reading from {@code is} fails
 */
public static String guessCharset(InputStream is) throws IOException{
	final UniversalDetector detector = new UniversalDetector(null);
	final byte[] chunk = new byte[Constants.ENCODING_HELPER_BUFFER_SIZE];

	for (int n = is.read(chunk); n > 0 && !detector.isDone(); n = is.read(chunk)) {
		detector.handleData(chunk, 0, n);
	}
	detector.dataEnd();

	final String detected = detector.getDetectedCharset();
	detector.reset();

	if (detected == null) {
		return Constants.DEFAULT_ENCODING;
	}
	return detected;
}