package in.bhargavrao.stackoverflow.natty.services;

import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import in.bhargavrao.stackoverflow.natty.model.Post;
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.language.detect.LanguageWriter;

import java.io.IOException;
import java.util.List;

import static in.bhargavrao.stackoverflow.natty.utils.CheckUtils.checkIfNoCodeBlock;
import static in.bhargavrao.stackoverflow.natty.utils.CheckUtils.stripBody;
import static in.bhargavrao.stackoverflow.natty.utils.CheckUtils.stripTags;

/**
 * Checker that reports the language detected in a post's text.
 *
 * <p>The Optimaize detector is consulted first; when it gives no answer, the
 * Tika detector is used as a fallback for texts longer than
 * {@link #TIKA_FALLBACK_LENGTH} characters.
 */
public class NonEnglishCheckerService implements CheckerService<String> {

    /** Text length (after stripping) that separates the Tika fallback from the "too short" case. */
    private static final int TIKA_FALLBACK_LENGTH = 50;

    private List<LanguageProfile> languageProfiles;
    private com.optimaize.langdetect.LanguageDetector optimaizeDetector;
    private org.apache.tika.language.detect.LanguageDetector tikaDetector;
    private TextObjectFactory textObjectFactory;
    private LanguageWriter writer;

    /**
     * Loads the built-in Optimaize language profiles, builds the Optimaize
     * detector and text factory, and loads the Tika detector models.
     *
     * <p>An {@link IOException} during loading is printed and otherwise ignored;
     * every field assigned after the failing statement then stays {@code null}.
     */
    public NonEnglishCheckerService() {
        try {
            languageProfiles = new LanguageProfileReader().readAllBuiltIn();
            optimaizeDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .withProfiles(languageProfiles)
                    .build();
            textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
            tikaDetector = new OptimaizeLangDetector().loadModels();
            writer = new LanguageWriter(tikaDetector);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Detects the language of the given post.
     *
     * <p>The post body is passed through {@code stripBody} and {@code stripTags}
     * and has all punctuation runs removed before detection.
     *
     * @param post the post to inspect
     * @return the language reported by the Optimaize detector when it finds one;
     *         otherwise the Tika result for texts longer than
     *         {@link #TIKA_FALLBACK_LENGTH} characters ({@code null} when Tika
     *         reports an empty language); {@code "Gibberish"} for a text of exactly
     *         {@link #TIKA_FALLBACK_LENGTH} characters for which
     *         {@code checkIfNoCodeBlock(post)} is true; {@code null} in every
     *         other case, including when an {@link IOException} occurs
     */
    @Override
    public String check(Post post) {
        String dataToCheck = stripTags(stripBody(post)).replaceAll("\\p{Punct}+", "");
        try {
            TextObject textObject = textObjectFactory.forText(dataToCheck);
            Optional<LdLocale> lang = optimaizeDetector.detect(textObject);
            if (lang.isPresent()) {
                return lang.get().getLanguage();
            }

            int length = dataToCheck.length();
            if (length > TIKA_FALLBACK_LENGTH) {
                // The writer (and the detector behind it) is shared by every call.
                // Clear it first so text from previously checked posts cannot
                // influence the result for this one.
                writer.reset();
                writer.append(dataToCheck);
                LanguageResult result = writer.getLanguage();
                String tikaLang = result.getLanguage();
                return tikaLang.isEmpty() ? null : tikaLang;
            }
            if (length < TIKA_FALLBACK_LENGTH) {
                return null;
            }

            // NOTE(review): this point is reached only when the text is exactly
            // TIKA_FALLBACK_LENGTH characters long, which looks unintended.
            // Behaviour is preserved as-is; confirm the desired boundaries.
            if (checkIfNoCodeBlock(post)) {
                return "Gibberish";
            }
            return null;
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}