com.cybozu.labs.langdetect.LangDetectException Java Examples

The following examples show how to use com.cybozu.labs.langdetect.LangDetectException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractQParser.java    From SearchServices with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Detects the candidate languages of the given text.
 *
 * <p>Candidates are kept only when {@code autoDetectQueryLocales} is empty or
 * contains the candidate's language code.
 *
 * @param content the text to analyse
 * @return the accepted candidates with their probabilities; an empty list when
 *         the text is blank or a {@link LangDetectException} is thrown
 */
private List<DetectedLanguage> detectLanguage(String content) {
	// Blank input: nothing to analyse (to be consistent with the tika impl?)
	if (content.trim().isEmpty()) {
		log.debug("No input text to detect language from, returning empty list");
		return Collections.emptyList();
	}

	try {
		Detector langDetector = DetectorFactory.create();
		langDetector.append(content);
		ArrayList<Language> candidates = langDetector.getProbabilities();

		ArrayList<DetectedLanguage> accepted = new ArrayList<>();
		for (Language candidate : candidates) {
			// An empty locale filter means "accept every language".
			boolean allowed = autoDetectQueryLocales.size() == 0
					|| autoDetectQueryLocales.contains(candidate.lang);
			if (!allowed) {
				continue;
			}
			accepted.add(new DetectedLanguage(candidate.lang, candidate.prob));
		}
		return accepted;
	} catch (LangDetectException e) {
		log.debug("Could not determine language, returning empty list: ", e);
		return Collections.emptyList();
	}
}
 
Example #2
Source File: LanguageDetectionService.java    From mojito with Apache License 2.0 6 votes vote down vote up
/**
 * Gets a customized detector for a given language.
 *
 * <p>Every supported language is given a prior: 0.8 for the requested
 * language, 0.5 for English when English is not the requested language, and
 * 0.1 for everything else.
 *
 * TODO(P1) Adding priority on the language seems to be relatively useless.
 * To be reviewed.
 *
 * @param language the language code to favour
 * @return a {@link Detector} customized for that language
 * @throws LangDetectException propagated from the underlying langdetect calls
 */
private Detector getDetectorForLanguage(String language) throws LangDetectException {
    Detector detector = DetectorFactory.create();
    // Parameterized (previously a raw HashMap) so the map is type-checked.
    HashMap<String, Double> priorityMap = new HashMap<>();

    for (String supportedLanguage : getSupportedLanguages()) {
        if (supportedLanguage.equals(language)) {
            priorityMap.put(supportedLanguage, 0.8);
        } else if ("en".equals(supportedLanguage)) {
            // Reaching this branch already implies language is not "en",
            // otherwise the first branch would have matched.
            priorityMap.put(supportedLanguage, 0.5);
        } else {
            priorityMap.put(supportedLanguage, 0.1);
        }
    }

    detector.setPriorMap(priorityMap);

    return detector;
}
 
Example #3
Source File: LanguageDetectionAnnotator.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the language of the document text and stores it on the CAS.
 *
 * <p>When the text is shorter than {@code minTextLenght} and a title is
 * available, the title is prepended first. Detection only runs when the
 * resulting text is longer than {@code minTextLenght}; a
 * {@link LangDetectException} is logged and otherwise ignored.
 *
 * @param jCas the CAS whose document language is set
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    String title = getTitle(jCas);
    String text = jCas.getDocumentText();

    // No document text: nothing to detect. This check must come before any
    // text.length() call, which previously threw a NullPointerException.
    if (text == null) {
        return;
    }

    // add title to text if too small
    if (text.length() < minTextLenght && title != null && title.length() > 0) {
        text = title + " " + text;
    }

    // only detect if text is long enough
    if (text.length() > minTextLenght) {

        // TODO maybe cut if text too long --> slower
        try {

            jCas.setDocumentLanguage(detect(text));

        } catch (LangDetectException e) {
            LOG.warn("error detecting language for {}, {}",
                    getHeaderDocId(jCas), e);
        }
    }
}
 
Example #4
Source File: DetectionServiceImplLanguageDetection.java    From weslang with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the language of the given text.
 *
 * @param text the text to analyse
 * @return a result built from the first entry returned by
 *         {@code getProbabilities()}, or {@code UNKNOWN} when the detector
 *         cannot be created or no entry is returned
 */
@Override
public DetectionResult detect(String text) {
  final Detector langDetector;
  try {
    langDetector = DetectorFactory.create();
  } catch (LangDetectException e) {
    // TODO(skreft): log the reason
    return UNKNOWN;
  }

  langDetector.append(text);
  List<Language> candidates = langDetector.getProbabilities();
  if (candidates.isEmpty()) {
    return UNKNOWN;
  }

  Language first = candidates.get(0);
  return new DetectionResult(first.lang, first.prob);
}
 
Example #5
Source File: LanguageDetector.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Performs the language detection on a lower-cased copy of the text.
 *
 * @param text the text to test
 * @return the detection result, or {@code null} when a
 *         {@link LangDetectException} is thrown (the exception is logged)
 */
public LanguageDetectionResult detectLanguage(String text) {
    try {
        // NOTE(review): 0.15 is handed to DetectorFactory.create; presumably
        // the detector's smoothing parameter - confirm against langdetect.
        Detector detector = DetectorFactory.create(0.15);
        // issue#47 correction
        // Lower-case with a fixed locale so the result does not depend on the
        // JVM default locale (e.g. "I" becomes a dotless i under tr-TR).
        detector.append(text.toLowerCase(java.util.Locale.ROOT));
        ArrayList<Language> languages = detector.getProbabilities();
        Language detectedLanguage =  
                extractLangWithHighestProbability(languages);
        return new LanguageDetectionResult(detectedLanguage, text, languages.size()>1);
    } catch (LangDetectException ex) {
        LOGGER.warn(ex);
    }
    return null;
}
 
Example #6
Source File: LanguageDetector.java    From Asqatasun with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Initialise the language profiles needed by the detector. This
 * initialisation has to be performed only once.
 *
 * <p>Every resource matching {@code profilePath} is read fully into a string
 * and the collected strings are passed to {@code DetectorFactory.loadProfile}.
 * An {@link IOException} or {@link LangDetectException} is logged as a
 * warning and not rethrown.
 */
private void initProfiles() {
    PathMatchingResourcePatternResolver resolver = 
            new PathMatchingResourcePatternResolver();
    List<String> profiles = new ArrayList<>();
    DetectorFactory.setSeed(0L);
    try {
        for (Resource rs : resolver.getResources(profilePath)) {
            StringWriter writer = new StringWriter();
            // Close each resource stream, even when the copy fails; the
            // original left every stream open.
            try (java.io.InputStream in = rs.getInputStream()) {
                // Decode explicitly as UTF-8 instead of the platform default
                // charset, so non-ASCII n-grams are read identically on
                // every platform.
                IOUtils.copy(in, writer, "UTF-8");
            }
            profiles.add(writer.toString());
        }
        DetectorFactory.loadProfile(profiles);
    } catch (IOException | LangDetectException ex) {
        LOGGER.warn(ex);
    }
}
 
Example #7
Source File: LanguageDetectionFilter.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>While no configuration has been stored yet, this also loads the
 * langdetect profiles from the directory named by
 * {@code langdetect.profile.dir} and reads {@code langdetect.textsize}.
 * A {@link LangDetectException} is kept in {@code cause} instead of being
 * thrown here.
 */
public void setConf(Configuration conf) {
	// Already configured once: only replace the stored configuration.
	if (this.conf != null) {
		this.conf = conf;
		return;
	}

	try {
		DetectorFactory.loadProfile(conf.get("langdetect.profile.dir"));
		textsize_upper_limit = conf.getInt("langdetect.textsize", TEXTSIZE_UPPER_LIMIT_DEFAULT);
	} catch (LangDetectException e) {
		// afterward throw when filter() is called
		cause = e;
	}
	this.conf = conf;
}
 
Example #8
Source File: LanguageDetectionFilter.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code lang} field to the document. The value is the language
 * found in the parse metadata when present; otherwise it is detected from
 * the title and text of the parse, falling back to {@code "unknown"}.
 *
 * @throws IndexingException when no configuration has been set, when profile
 *         loading failed earlier, or when detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// No language in the metadata: detect it from "<title> <text>".
		final String content = parse.getData().getTitle() + " " + parse.getText();
		try {
			Detector langDetector = DetectorFactory.create();
			langDetector.setMaxTextLength(textsize_upper_limit);
			langDetector.append(content);
			language = langDetector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}
	if (language == null) {
		language = "unknown";
	}

	doc.add("lang", language);
	return doc;
}
 
Example #9
Source File: LangDetectTest.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * Sample run: times detector creation and a single detection, then prints
 * the detected language and every entry returned by
 * {@code getProbabilities()}.
 */
@Test
public static void langDetectSample() {

    long startTime;
    String lang = "none";
    // Start with an empty list so the reporting loop below is safe when
    // detection fails; it used to stay null and throw a NullPointerException.
    ArrayList<Language> langlist = new ArrayList<Language>();

    try {

        // Initialize
        startTime = System.currentTimeMillis();
        DetectorFactory.create();
        System.out.println("Initialization finished in " + (System.currentTimeMillis() - startTime) + " ms");

        // Detect
        startTime = System.currentTimeMillis();
        Detector detector = DetectorFactory.create();
        detector.append("The quick brown fox jumps over the lazy dog.");
        lang = detector.detect();
        System.out.println("Detection finished in " + (System.currentTimeMillis() - startTime) + " ms");

        // Get probabilities
        langlist = detector.getProbabilities();

    } catch (LangDetectException e) {
        System.err.println("Detection failed");
        e.printStackTrace();
    }

    System.out.println("Detected language: " + lang);
    for (Language s : langlist) {
        System.out.println(s);
    }

}
 
Example #10
Source File: LanguageDetectionFilter.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>While no configuration has been stored yet, this also loads the
 * langdetect profiles from the directory named by
 * {@code langdetect.profile.dir} and reads {@code langdetect.textsize}.
 * A {@link LangDetectException} is kept in {@code cause} instead of being
 * thrown here.
 */
public void setConf(Configuration conf) {
	// Already configured once: only replace the stored configuration.
	if (this.conf != null) {
		this.conf = conf;
		return;
	}

	try {
		DetectorFactory.loadProfile(conf.get("langdetect.profile.dir"));
		textsize_upper_limit = conf.getInt("langdetect.textsize", TEXTSIZE_UPPER_LIMIT_DEFAULT);
	} catch (LangDetectException e) {
		// afterward throw when filter() is called
		cause = e;
	}
	this.conf = conf;
}
 
Example #11
Source File: LanguageDetectionFilter.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code lang} field to the document. The value is the language
 * found in the parse metadata when present; otherwise it is detected from
 * the title and text of the parse, falling back to {@code "unknown"}.
 *
 * @throws IndexingException when no configuration has been set, when profile
 *         loading failed earlier, or when detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// No language in the metadata: detect it from "<title> <text>".
		final String content = parse.getData().getTitle() + " " + parse.getText();
		try {
			Detector langDetector = DetectorFactory.create();
			langDetector.setMaxTextLength(textsize_upper_limit);
			langDetector.append(content);
			language = langDetector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}
	if (language == null) {
		language = "unknown";
	}

	doc.add("lang", language);
	return doc;
}
 
Example #12
Source File: DetectionServiceImplLanguageDetection.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the service and loads the default langdetect profiles.
 *
 * @throws IllegalArgumentException when loading the profiles fails with a
 *         {@link LangDetectException}; the original exception is attached as
 *         the cause
 */
public DetectionServiceImplLanguageDetection() throws IOException,
                                                      UnsupportedEncodingException,
                                                      IllegalArgumentException {
  try {
    DetectorFactory.loadDefaultProfiles();
  } catch (LangDetectException e) {
    // Keep the original exception as the cause so its stack trace is not lost.
    throw new IllegalArgumentException(e.getMessage(), e);
  }
}
 
Example #13
Source File: CybozuLanguageIdentifier.java    From dkpro-c4corpus with Apache License 2.0 5 votes vote down vote up
/**
 * Identifies the language of the text content of an HTML document.
 *
 * @param html the HTML source
 * @return the detected language when the first probability entry exceeds
 *         {@code PROBABILITY_THRESHOLD}; {@code UNKNOWN_LANGUAGE} when the
 *         extracted text is empty, no probability entry is returned, the
 *         threshold is not exceeded, or detection throws
 * @throws IOException declared by the overridden method
 */
@Override
public String identifyLanguage(String html)
        throws IOException
{
    // extracting plain html text
    Document doc = Jsoup.parse(html);
    String text = doc.text();

    // we might have removed everything -> no lang
    if (text.isEmpty()) {
        return UNKNOWN_LANGUAGE;
    }

    try {
        Detector detector = DetectorFactory.create();
        detector.append(text);
        String detectedLang = detector.detect();

        ArrayList<Language> detectedProbabilities = detector.getProbabilities();

        // Guard against an empty list: get(0) would otherwise throw an
        // IndexOutOfBoundsException, which the catch below does not handle.
        if (!detectedProbabilities.isEmpty()
                && detectedProbabilities.get(0).prob > PROBABILITY_THRESHOLD) {
            return detectedLang;
        }
        else {
            return UNKNOWN_LANGUAGE;
        }
    }
    catch (LangDetectException e) {
        return UNKNOWN_LANGUAGE;
    }
}
 
Example #14
Source File: LangDetectProcessorTests.java    From elasticsearch-ingest-langdetect with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that ingesting a value consisting only of digits and spaces fails
 * with a {@link LangDetectException} whose message is
 * {@code "no features in text"}.
 */
public void testNumbersOnlyThrowsException() throws Exception {
    final Map<String, Object> processorConfig = config("source_field", "language", false);

    final LangDetectException thrown = expectThrows(
            LangDetectException.class,
            () -> ingestDocument(processorConfig, "source_field", "124 56456 546 3432"));

    final String message = thrown.getMessage();
    assertThat(message, is("no features in text"));
}
 
Example #15
Source File: IngestLangDetectPlugin.java    From elasticsearch-ingest-langdetect with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the language profiles from the classpath and registers the
 * langdetect processor factory under {@code LangDetectProcessor.TYPE}.
 *
 * @param parameters processor parameters; only {@code parameters.env} is read
 * @return a map holding the single langdetect processor factory
 * @throws ElasticsearchException wrapping any failure while loading profiles
 */
@Override
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
    // Profiles must be available before a processor can be created.
    try {
        SecureDetectorFactory.loadProfileFromClassPath(parameters.env);
    } catch (IOException | URISyntaxException | LangDetectException loadFailure) {
        throw new ElasticsearchException(loadFailure);
    }

    final Map<String, Processor.Factory> processors = new HashMap<>(1);
    final Processor.Factory langDetectFactory = new LangDetectProcessor.Factory();
    processors.put(LangDetectProcessor.TYPE, langDetectFactory);
    return processors;
}
 
Example #16
Source File: LangDetectLanguageIdentifierUpdateProcessorFactory.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the langdetect profiles listed in {@code languages} from the
 * {@code langdetect-profiles/} classpath resources, then fixes the detector
 * seed to 0. Calls after the first one return immediately.
 *
 * @throws IOException if a profile resource is missing or cannot be read
 * @throws LangDetectException propagated from {@code DetectorFactory.loadProfile}
 */
public static synchronized void loadData() throws IOException, LangDetectException {
  if (loaded) {
    return;
  }
  // NOTE(review): the flag is set before loading, so a failed load is not
  // retried on the next call - confirm this is intended.
  loaded = true;
  List<String> profileData = new ArrayList<>();
  for (String language : languages) {
    String resourceName = "langdetect-profiles/" + language;
    InputStream stream = LangDetectLanguageIdentifierUpdateProcessor.class.getResourceAsStream(resourceName);
    if (stream == null) {
      // getResourceAsStream returns null for a missing resource; fail with a
      // message naming it instead of an anonymous NullPointerException.
      throw new IOException("Missing langdetect profile resource: " + resourceName);
    }
    // try-with-resources closes the reader (and the wrapped stream) even
    // when reading the profile throws.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
      profileData.add(new String(IOUtils.toCharArray(reader)));
    }
  }
  DetectorFactory.loadProfile(profileData);
  DetectorFactory.setSeed(0);
}
 
Example #17
Source File: LanguageDetectionService.java    From mojito with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the service. When {@code DetectorFactory} reports no languages yet,
 * every resource matching {@code profiles/*} is read as UTF-8 and loaded as
 * a langdetect profile. In both cases the supported language list is then
 * taken from {@code DetectorFactory.getLangList()}.
 *
 * @throws RuntimeException when the profile resources cannot be listed, read
 *         or loaded
 */
public LanguageDetectionService() {

        if (!DetectorFactory.getLangList().isEmpty()) {
            logger.debug("langdetect profiles are already initialized");
        } else {
            logger.debug("Initialize langdetect with profiles");
            List<String> profileJsons = new ArrayList<>();

            Resource[] profileResources;
            try {
                profileResources = new PathMatchingResourcePatternResolver().getResources("profiles/*");
            } catch (IOException ex) {
                throw new RuntimeException("Cannot get the list of resources maching langdetect profiles", ex);
            }

            // Read each profile resource fully into memory.
            for (Resource profileResource : profileResources) {
                String profileName = profileResource.getFilename();
                logger.debug("Add profile for: {}", profileName);

                try {
                    String json = Resources.toString(profileResource.getURL(), StandardCharsets.UTF_8);
                    profileJsons.add(json);
                } catch (Exception e) {
                    throw new RuntimeException("Cannot load langdetect profile for " + profileName, e);
                }
            }

            try {
                logger.debug("Load profiles");
                DetectorFactory.loadProfile(profileJsons);
            } catch (LangDetectException lde) {
                throw new RuntimeException("Cannot load langdetect profiles", lde);
            }
        }

        logger.debug("Sets langdetect supported languages");
        supportedLanguages = Collections.unmodifiableList(DetectorFactory.getLangList());
    }
 
Example #18
Source File: LanguageDetectionServiceTest.java    From mojito with Apache License 2.0 4 votes vote down vote up
/**
 * Runs language detection over the {@code ko-KR} translations and logs how
 * many were skipped (source equals target), failed, mismatched or matched,
 * together with the average probability and the share of good detections.
 * The method only logs; it makes no assertions.
 */
@Test
public void testDetection() throws LangDetectException, IOException, UnsupportedLanguageException {

    int sourceEqualsTarget = 0;
    int nbFailed = 0;
    int nbBad = 0;
    int nbGood = 0;
    double probabilitiesSum = 0;

    List<TextUnitDTO> translationsForLanguage = getTranslationsForLanguage("ko-KR", null);

    for (TextUnitDTO translation : translationsForLanguage) {

        // Untranslated units (target identical to source) are not checked.
        if (translation.getTarget().equals(translation.getSource())) {
            logger.debug("Skip source = target: {}", translation.getSource());
            sourceEqualsTarget++;
            continue;
        }

        LanguageDetectionResult ldr = languageDetectionService.detect(
                translation.getTarget(),
                translation.getTargetLocale());
        probabilitiesSum += ldr.getProbability();

        if (ldr.getLangDetectException() != null) {
            nbFailed++;
            logger.info("Language detection failed for: {}", translation.getTarget());
            logger.info("Error was", ldr.getLangDetectException());
            continue;
        }

        if (ldr.isExpectedLanguage()) {
            nbGood++;
            continue;
        }

        nbBad++;
        logger.info("Not proper language, found: {} probability: {}, should be {}/{} ({}), text: {}, {}",
                ldr.getDetected(),
                ldr.getProbability(),
                ldr.getExpected(),
                translation.getTargetLocale(),
                ldr.getProbabilityExpected(),
                translation.getTarget(),
                ldr.getDetector() != null ? ldr.getDetector().getProbabilities() : "-");
    }

    int nbDetection = translationsForLanguage.size();

    logger.info("detection: {}, source = target: {}, good: {},  bad: {}, failed: {}, average quality: {}, % good: {}",
            nbDetection, sourceEqualsTarget, nbGood, nbBad, nbFailed, probabilitiesSum / (double) nbDetection, nbGood / (double) nbDetection);
}
 
Example #19
Source File: LanguageDetectionResult.java    From mojito with Apache License 2.0 4 votes vote down vote up
/**
 * Sets the {@link LangDetectException} held by this result.
 *
 * @param langDetectException the exception to store
 */
public void setLangDetectException(LangDetectException langDetectException) {
    this.langDetectException = langDetectException;
}
 
Example #20
Source File: LanguageDetectionResult.java    From mojito with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the {@link LangDetectException} held by this result.
 *
 * @return the stored exception
 */
public LangDetectException getLangDetectException() {
    return langDetectException;
}
 
Example #21
Source File: LanguageDetectionAnnotator.java    From bluima with Apache License 2.0 3 votes vote down vote up
/**
 * Detects the language of the given text with a freshly created detector.
 *
 * @param text the text to analyse
 * @return the value returned by {@code Detector.detect()}
 * @throws LangDetectException propagated from the underlying langdetect calls
 */
public static String detect(String text) throws LangDetectException {

        // NOTE(review): 0.5 is handed to DetectorFactory.create; presumably
        // the detector's smoothing parameter - confirm against langdetect.
        final Detector langDetector = DetectorFactory.create(0.5);
        langDetector.append(text);

        final String language = langDetector.detect();
        return language;
    }