com.optimaize.langdetect.profiles.LanguageProfile Java Examples

The following examples show how to use com.optimaize.langdetect.profiles.LanguageProfile. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: OptimaizeDetector.java    From vespa with Apache License 2.0 6 votes vote down vote up
/**
 * Lazily sets up the static language detector and text object factory.
 * Returns immediately when both are already set; otherwise loads every built-in
 * language profile and assigns both fields while holding {@code initGuard}.
 *
 * @throws UncheckedIOException if the built-in language profiles cannot be read
 */
static private void initOptimaize() {
    synchronized (initGuard) {
        final boolean alreadyInitialized = (textObjectFactory != null) && (languageDetector != null);
        if (alreadyInitialized) {
            return;
        }

        // origin: https://github.com/optimaize/language-detector
        // Step 1: read every language profile bundled with the library.
        final List<LanguageProfile> builtInProfiles;
        try {
            builtInProfiles = new LanguageProfileReader().readAllBuiltIn();
        } catch (IOException readFailure) {
            // Surface the checked failure as unchecked, keeping the cause.
            throw new UncheckedIOException(readFailure);
        }

        // Step 2: build the detector from those profiles.
        languageDetector = LanguageDetectorBuilder
                .create(NgramExtractors.standard())
                .withProfiles(builtInProfiles)
                .build();

        // Step 3: pick the text object factory.
        textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
    }
}
 
Example #2
Source File: Translator.java    From KaellyBot with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Returns the static language detector, building it on first use from the profiles
 * named by the lower-cased abbreviation of every {@code Language} value.
 *
 * @return the detector, or {@code null} when the profiles could not be read; the
 *         failure is logged and the next call tries again
 */
private static LanguageDetector getLanguageDetector(){
    if (languageDetector == null){
        try {
            List<String> languages = new ArrayList<>();
            for (Language lg : Language.values()) {
                // Locale.ROOT keeps the profile names independent of the JVM default locale
                // (under a Turkish locale "I" would lower-case to a dotless i).
                languages.add(lg.getAbrev().toLowerCase(java.util.Locale.ROOT));
            }

            List<LanguageProfile> languageProfiles = new LanguageProfileReader().read(languages);
            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                            .withProfiles(languageProfiles).build();
        }
        catch (IOException e) {
            // Best effort: log and fall through, leaving languageDetector null.
            LOG.error("Translator.getLanguageDetector", e);
        }
    }
    return languageDetector;
}
 
Example #3
Source File: VideoBlocker.java    From SkyTube with GNU General Public License v3.0 5 votes vote down vote up
private LanguageDetectionSingleton() throws IOException {
	// load all languages
	List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();

	// build language detector
	languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
			.withProfiles(languageProfiles)
			.build();

	// create a text object factory
	textObjectFactory = CommonTextObjectFactories.forDetectingShortCleanText();
}
 
Example #4
Source File: LanguageDetectorImplTest.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a detector for "en", "fr", "nl" and "de", loading each profile from the
 * {@code /languages/<code>} classpath resource and converting it from the old
 * profile format.
 *
 * @throws IOException if a profile resource cannot be read
 */
private LanguageDetector makeNewDetector() throws IOException {
    final LanguageDetectorBuilder detectorBuilder =
            LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .shortTextAlgorithm(50)
                    .prefixFactor(1.5)
                    .suffixFactor(2.0);

    final LangProfileReader oldFormatReader = new LangProfileReader();
    for (String code : ImmutableList.of("en", "fr", "nl", "de")) {
        final String resourcePath = "/languages/" + code;
        final LangProfile oldProfile =
                oldFormatReader.read(LanguageDetectorImplTest.class.getResourceAsStream(resourcePath));
        detectorBuilder.withProfile(OldLangProfileConverter.convert(oldProfile));
    }

    return detectorBuilder.build();
}
 
Example #5
Source File: DataLanguageDetectorImplTest.java    From language-detector with Apache License 2.0 5 votes vote down vote up
public DataLanguageDetectorImplTest() throws IOException {
    List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();

    shortDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .shortTextAlgorithm(100)
            .withProfiles(languageProfiles)
            .build();

    longDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .shortTextAlgorithm(0)
            .withProfiles(new LanguageProfileReader().readAllBuiltIn())
            .build();
}
 
Example #6
Source File: CommandLineInterface.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a detector from every language profile found in the directory named by the
 * "directory" parameter, configured with the "alpha" parameter (falling back to
 * {@code DEFAULT_ALPHA}) and the optional "seed" parameter.
 *
 * @throws IOException if the language profiles cannot be read
 */
private LanguageDetector makeDetector() throws IOException {
    final double alphaValue = getParamDouble("alpha", DEFAULT_ALPHA);
    final File profileDir = new File(requireParamString("directory") + "/");
    final Optional<Long> seedValue = Optional.fromNullable(getParamLongOrNull("seed"));

    final List<LanguageProfile> profilesFromDir = new LanguageProfileReader().readAll(profileDir);

    final LanguageDetectorBuilder configured = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .alpha(alphaValue)
            .seed(seedValue)
            .shortTextAlgorithm(50)
            .withProfiles(profilesFromDir);
    return configured.build();
}
 
Example #7
Source File: NgramFrequencyData.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the n-gram frequency table for the given profiles: for every n-gram of the
 * requested lengths, an array with one slot per profile (in iteration order) holding
 * that n-gram's frequency divided by the profile's total occurrences of n-grams of
 * the same length.
 *
 * @param languageProfiles the profiles to index; must not be empty
 * @param gramLengths for example [1,2,3]; must not be empty
 * @throws java.lang.IllegalArgumentException if languageProfiles or gramLengths is empty, or if one of the
 *         languageProfiles does not have the grams of the required sizes.
 */
@NotNull
public static NgramFrequencyData create(@NotNull Collection<LanguageProfile> languageProfiles, @NotNull Collection<Integer> gramLengths) throws IllegalArgumentException {
    if (languageProfiles.isEmpty()) {
        throw new IllegalArgumentException("No languageProfiles provided!");
    }
    if (gramLengths.isEmpty()) {
        throw new IllegalArgumentException("No gramLengths provided!");
    }

    final int languageCount = languageProfiles.size();
    final Map<String, double[]> probabilitiesByNgram = new HashMap<>();
    final List<LdLocale> locales = new ArrayList<>();

    // Position of the current profile; doubles as the slot index in each probability array.
    int languageIndex = 0;
    for (LanguageProfile profile : languageProfiles) {
        locales.add(profile.getLocale());

        for (Integer gramLength : gramLengths) {
            if (!profile.getGramLengths().contains(gramLength)) {
                throw new IllegalArgumentException("The language profile for "+profile.getLocale()+" does not contain "+gramLength+"-grams!");
            }
            for (Map.Entry<String, Integer> entry : profile.iterateGrams(gramLength)) {
                final String ngram = entry.getKey();
                final Integer frequency = entry.getValue();

                // Create the per-language array the first time an n-gram is seen.
                double[] perLanguage = probabilitiesByNgram.get(ngram);
                if (perLanguage == null) {
                    perLanguage = new double[languageCount];
                    probabilitiesByNgram.put(ngram, perLanguage);
                }
                perLanguage[languageIndex] = frequency.doubleValue() / profile.getNumGramOccurrences(ngram.length());
            }
        }
        languageIndex++;
    }

    return new NgramFrequencyData(probabilitiesByNgram, locales);
}
 
Example #8
Source File: LanguageProfileValidator.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Removes every loaded LanguageProfile whose locale language equals the given string,
 * e.g. in combination with {@link #loadAllBuiltInLanguageProfiles()}.
 *
 * @param isoString the ISO string of the LanguageProfile to be removed.
 * @return this validator, for chaining
 */
public LanguageProfileValidator removeLanguageProfile(final String isoString) {
    final Predicate<LanguageProfile> hasGivenLanguage = new Predicate<LanguageProfile>() {
        @Override
        public boolean apply(LanguageProfile candidate) {
            final String candidateLanguage = candidate.getLocale().getLanguage();
            return candidateLanguage.equals(isoString);
        }
    };
    Iterables.removeIf(this.languageProfiles, hasGivenLanguage);
    return this;
}
 
Example #9
Source File: LanguageDetectorBuilder.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Adds each of the given profiles in iteration order by delegating to
 * {@code withProfile}; profiles added before a failure stay added.
 *
 * @return this builder, for chaining
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 */
public LanguageDetectorBuilder withProfiles(Iterable<LanguageProfile> languageProfiles) throws IllegalStateException {
    for (LanguageProfile profileToAdd : languageProfiles) {
        this.withProfile(profileToAdd);
    }
    return this;
}
 
Example #10
Source File: LanguageDetectorBuilder.java    From language-detector with Apache License 2.0 5 votes vote down vote up
/**
 * Registers one language profile with this builder after checking that its locale
 * is new and that it covers every gram length of the configured NgramExtractor.
 *
 * @return this builder, for chaining
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 * @throws IllegalArgumentException if the profile lacks a gram length the NgramExtractor handles.
 */
public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) throws IllegalStateException {
    final boolean alreadyAdded = langsAdded.contains(languageProfile.getLocale());
    if (alreadyAdded) {
        throw new IllegalStateException("A language profile for language "+languageProfile.getLocale()+" was added already!");
    }

    for (Integer requiredLength : ngramExtractor.getGramLengths()) {
        if (languageProfile.getGramLengths().contains(requiredLength)) {
            continue;
        }
        throw new IllegalArgumentException("The NgramExtractor is set to handle "+requiredLength+"-grams but the given language profile for "+languageProfile.getLocale()+" does not support this!");
    }

    // Both checks passed: remember the locale, then keep the profile.
    langsAdded.add(languageProfile.getLocale());
    languageProfiles.add(languageProfile);
    return this;
}
 
Example #11
Source File: DocumentLanguage.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the language detector from all built-in language profiles and selects the
 * text object factory obtained from {@code forDetectingOnLargeText()}.
 *
 * @throws ResourceInitializationException wrapping any IOException raised during setup
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
  try {
    final List<LanguageProfile> builtInProfiles = new LanguageProfileReader().readAllBuiltIn();
    final LanguageDetectorBuilder detectorBuilder =
        LanguageDetectorBuilder.create(NgramExtractors.standard());
    languageDetector = detectorBuilder.withProfiles(builtInProfiles).build();

    textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
  } catch (IOException readFailure) {
    throw new ResourceInitializationException(readFailure);
  }
}
 
Example #12
Source File: LanguageDetectorBuilder.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a single language profile to this builder. The profile's locale must not have
 * been added before, and the profile must contain every gram length the configured
 * NgramExtractor handles.
 *
 * @return this builder, for chaining
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 * @throws IllegalArgumentException if a gram length handled by the NgramExtractor is missing from the profile.
 */
public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) throws IllegalStateException {
    if (langsAdded.contains(languageProfile.getLocale())) {
        throw new IllegalStateException("A language profile for language " + languageProfile.getLocale() + " was added already!");
    }

    for (Integer neededLength : ngramExtractor.getGramLengths()) {
        final boolean supported = languageProfile.getGramLengths().contains(neededLength);
        if (!supported) {
            throw new IllegalArgumentException("The NgramExtractor is set to handle " + neededLength + "-grams but the given language profile for " + languageProfile.getLocale() + " does not support this!");
        }
    }

    // Validation done: record the locale first, then store the profile.
    langsAdded.add(languageProfile.getLocale());
    languageProfiles.add(languageProfile);
    return this;
}
 
Example #13
Source File: LanguageDetectorImplTest.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a detector restricted to "en", "fr", "nl" and "de". Each profile is read
 * from the {@code /languages/<code>} classpath resource in the old format and
 * converted before being added to the builder.
 *
 * @throws IOException if a profile resource cannot be read
 */
private LanguageDetector makeNewDetector() throws IOException {
    final LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .shortTextAlgorithm(50)
            .prefixFactor(1.5)
            .suffixFactor(2.0);

    final LangProfileReader legacyReader = new LangProfileReader();
    final ImmutableList<String> languageCodes = ImmutableList.of("en", "fr", "nl", "de");
    for (String languageCode : languageCodes) {
        final LangProfile legacyProfile = legacyReader.read(LanguageDetectorImplTest.class.getResourceAsStream("/languages/" + languageCode));
        final LanguageProfile converted = OldLangProfileConverter.convert(legacyProfile);
        detectorBuilder.withProfile(converted);
    }

    return detectorBuilder.build();
}
 
Example #14
Source File: CommandLineInterface.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a detector using all language profiles read from the directory named by
 * the "directory" parameter. The "alpha" parameter (default {@code DEFAULT_ALPHA})
 * and the optional "seed" parameter are passed to the builder.
 *
 * @throws IOException if the language profiles cannot be read
 */
private LanguageDetector makeDetector() throws IOException {
    final double alphaParam = getParamDouble("alpha", DEFAULT_ALPHA);
    final String directoryPath = requireParamString("directory") + "/";
    final Optional<Long> seedParam = Optional.fromNullable(getParamLongOrNull("seed"));

    final File profileDir = new File(directoryPath);
    final List<LanguageProfile> loadedProfiles = new LanguageProfileReader().readAll(profileDir);

    return LanguageDetectorBuilder.create(NgramExtractors.standard())
            .alpha(alphaParam)
            .seed(seedParam)
            .shortTextAlgorithm(50)
            .withProfiles(loadedProfiles)
            .build();
}
 
Example #15
Source File: NgramFrequencyData.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Collects, for every n-gram of the requested lengths, one probability per profile:
 * the n-gram's frequency in that profile divided by the profile's total occurrences
 * of n-grams of the same length. Array slots follow the iteration order of the profiles.
 *
 * @param languageProfiles the profiles to index; must not be empty
 * @param gramLengths for example [1,2,3]; must not be empty
 * @throws java.lang.IllegalArgumentException if languageProfiles or gramLengths is empty, or if one of the languageProfiles does not have the grams of the required sizes.
 */
@NotNull
public static NgramFrequencyData create(@NotNull Collection<LanguageProfile> languageProfiles, @NotNull Collection<Integer> gramLengths) throws IllegalArgumentException {
    if (languageProfiles.isEmpty()) {
        throw new IllegalArgumentException("No languageProfiles provided!");
    }
    if (gramLengths.isEmpty()) {
        throw new IllegalArgumentException("No gramLengths provided!");
    }

    final int profileCount = languageProfiles.size();
    final Map<String, double[]> probsPerNgram = new HashMap<>();
    final List<Locale> localesInOrder = new ArrayList<>();

    // Slot of the profile currently being processed.
    int slot = 0;
    for (LanguageProfile current : languageProfiles) {
        localesInOrder.add(current.getLocale());

        for (Integer length : gramLengths) {
            if (!current.getGramLengths().contains(length)) {
                throw new IllegalArgumentException("The language profile for " + current.getLocale() + " does not contain " + length + "-grams!");
            }
            for (Map.Entry<String, Integer> gram : current.iterateGrams(length)) {
                final String text = gram.getKey();
                final Integer count = gram.getValue();

                // Allocate the row lazily, on the first sighting of this n-gram.
                double[] row = probsPerNgram.get(text);
                if (row == null) {
                    row = new double[profileCount];
                    probsPerNgram.put(text, row);
                }
                final double probability = count.doubleValue() / current.getNumGramOccurrences(text.length());
                row[slot] = probability;
            }
        }
        slot++;
    }

    return new NgramFrequencyData(probsPerNgram, localesInOrder);
}
 
Example #16
Source File: LanguageProfileValidator.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Drops all loaded LanguageProfiles whose locale language equals the given string,
 * e.g. in combination with {@link #loadAllBuiltInLanguageProfiles()}.
 *
 * @param isoString the ISO string of the LanguageProfile to be removed.
 * @return this validator, for chaining
 */
public LanguageProfileValidator removeLanguageProfile(final String isoString) {
    final Predicate<LanguageProfile> sameLanguage = new Predicate<LanguageProfile>() {
        @Override
        public boolean apply(LanguageProfile profile) {
            return profile.getLocale().getLanguage().equals(isoString);
        }
    };
    Iterables.removeIf(this.languageProfiles, sameLanguage);
    return this;
}
 
Example #17
Source File: LanguageDetectorBuilder.java    From jstarcraft-nlp with Apache License 2.0 5 votes vote down vote up
/**
 * Passes every given profile, in iteration order, to {@code withProfile}. If one of
 * them is rejected, the profiles handled before it remain added.
 *
 * @return this builder, for chaining
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 */
public LanguageDetectorBuilder withProfiles(Iterable<LanguageProfile> languageProfiles) throws IllegalStateException {
    for (LanguageProfile each : languageProfiles) {
        this.withProfile(each);
    }
    return this;
}
 
Example #18
Source File: NgramFrequencyDataTest.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Creates frequency data over all built-in language profiles for a single gram size.
 *
 * @param gramSize the only gram length to include
 * @throws IOException if the built-in language profiles cannot be read
 */
private static NgramFrequencyData forAll(int gramSize) throws IOException {
    return NgramFrequencyData.create(
            new LanguageProfileReader().readAllBuiltIn(),
            ImmutableSet.of(gramSize));
}
 
Example #19
Source File: LanguageProfileValidator.java    From language-detector with Apache License 2.0 4 votes vote down vote up
/**
 * Load the given {@link LanguageProfile} by adding it to this validator's
 * collection of language profiles.
 *
 * @param languageProfile the profile to add
 * @return this validator, for chaining
 */
public LanguageProfileValidator loadLanguageProfile(LanguageProfile languageProfile) {
    this.languageProfiles.add(languageProfile);
    return this;
}
 
Example #20
Source File: LanguageProfileValidator.java    From language-detector with Apache License 2.0 4 votes vote down vote up
/**
 * Load the given {@link LanguageProfile}s by adding all of them to this validator's
 * collection of language profiles.
 *
 * @param languageProfiles the profiles to add
 * @return this validator, for chaining
 */
public LanguageProfileValidator loadLanguageProfiles(Collection<LanguageProfile> languageProfiles) {
    this.languageProfiles.addAll(languageProfiles);
    return this;
}
 
Example #21
Source File: LanguageProfileValidator.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Load the given {@link LanguageProfile}s: every element of the collection is added
 * to the language profiles held by this validator.
 *
 * @param languageProfiles the profiles to add
 * @return this validator, for chaining
 */
public LanguageProfileValidator loadLanguageProfiles(Collection<LanguageProfile> languageProfiles) {
    this.languageProfiles.addAll(languageProfiles);
    return this;
}
 
Example #22
Source File: NgramFrequencyDataTest.java    From language-detector with Apache License 2.0 4 votes vote down vote up
/**
 * Reads all built-in language profiles and builds frequency data restricted to the
 * given gram size.
 *
 * @param gramSize the single gram length to include
 * @throws IOException if the built-in language profiles cannot be read
 */
private static NgramFrequencyData forAll(int gramSize) throws IOException {
    final LanguageProfileReader profileReader = new LanguageProfileReader();
    return NgramFrequencyData.create(profileReader.readAllBuiltIn(), ImmutableSet.of(gramSize));
}
 
Example #23
Source File: LanguageProfileValidator.java    From jstarcraft-nlp with Apache License 2.0 4 votes vote down vote up
/**
 * Load the given {@link LanguageProfile}: it is added to the language profiles held
 * by this validator.
 *
 * @param languageProfile the profile to add
 * @return this validator, for chaining
 */
public LanguageProfileValidator loadLanguageProfile(LanguageProfile languageProfile) {
    this.languageProfiles.add(languageProfile);
    return this;
}
 
Example #24
Source File: DataLanguageDetectorImplTest.java    From jstarcraft-nlp with Apache License 2.0 3 votes vote down vote up
public DataLanguageDetectorImplTest() throws IOException {
    List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();

    shortDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()).shortTextAlgorithm(100).withProfiles(languageProfiles).build();

    longDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()).shortTextAlgorithm(0).withProfiles(new LanguageProfileReader().readAllBuiltIn()).build();
}