package de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.uima.custom.aes; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.EntityLinkingManager; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.access.EntityLinkingDataAccessException; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.config.ConfigUtils; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.datapreparation.dictionary.DictionaryEntity; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.datapreparation.dictionary.DictionaryEntriesDataProvider; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.datapreparation.yago.yago3.Yago3DictionaryEntriesSources; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.graph.similarity.exception.MissingSettingException; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.model.Token; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.model.Tokens; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.processor.DocumentProcessor; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.processor.UnprocessableDocumentException; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.uima.pipelines.PipelineType; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.util.CollectionUtils; import de.mpg.mpi_inf.ambiversenlu.nlu.language.Language; import de.mpg.mpi_inf.ambiversenlu.nlu.model.Document; import de.mpg.mpi_inf.ambiversenlu.nlu.model.ProcessedDocument; import org.apache.uima.UIMAException; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.*; public class UimaPOSTagger { public static Tokens tag(Language language, String text) throws UIMAException, IOException, EntityLinkingDataAccessException, NoSuchMethodException, ClassNotFoundException, MissingSettingException, UnprocessableDocumentException { return tag(language, text, PipelineType.POS_TAGGING); } public static Tokens tag(Language language, String text, PipelineType type) throws UIMAException, IOException, EntityLinkingDataAccessException, NoSuchMethodException, ClassNotFoundException, MissingSettingException, UnprocessableDocumentException { DocumentProcessor dp = DocumentProcessor.getInstance(type); Document doc = new Document.Builder().withText(text).withLanguage(language).build(); ProcessedDocument output = dp.process(doc); return output.getTokens(); } public static void main(String[] args) throws MissingSettingException, ClassNotFoundException, IOException, UnprocessableDocumentException, EntityLinkingDataAccessException, NoSuchMethodException, UIMAException { if (args.length != 3) { System.out.println("Usage: de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.uima.customComponents.aes.UimaPOSTagger <tsv source> <language - 3 code> <destination>"); return; } EntityLinkingManager.init(); new UimaPOSTagger().run(args[1], args[0], args[2]); } public void run(String language, String source, String destination) throws UIMAException, IOException, EntityLinkingDataAccessException, NoSuchMethodException, ClassNotFoundException, MissingSettingException, UnprocessableDocumentException { List<String> germanMentions = new ArrayList<>(); File file = new File(source); Scanner in = new Scanner(file); while (in.hasNextLine()) { String line = in.nextLine(); if (!line.contains("rdfs:label") || !line.contains("@" + Language.get3letterLanguage(language))) { continue; } String[] values = line.split("\t"); if (values.length < 4) { continue; } germanMentions.add(values[3].substring(values[3].indexOf('"') + 1, values[3].lastIndexOf('"'))); } Tokens test; Map<String, Integer> counts = new LinkedHashMap<>(); int totalMentions = germanMentions.size(); int processed = 0; for (String mention : germanMentions) { processed++; StringJoiner sg = new StringJoiner(" "); String text = mention + " es una entidad nombrada en nuestra base de datos."; test = tag(Language.getLanguageForString(language), text); int lenght = mention.length(); for (Token token : test.getTokens()) { if (token.getBeginIndex() <= lenght) { sg.add(token.getPOS()); } } counts.merge(sg.toString(), 1, (oldValue, one) -> oldValue + one); System.out.println("Processed " + processed + " mentions of " + totalMentions); } counts = CollectionUtils.sortMapByValue(counts, true); File fileOut = new File(destination); PrintWriter write = new PrintWriter(fileOut); for (Map.Entry<String, Integer> entry : counts.entrySet()) { write.println(entry.getKey().replaceAll(" ", "*")); } write.close(); } public void run(String language, DictionaryEntriesDataProvider dictionaryEntriesDataProvider, String destination) throws UIMAException, IOException, EntityLinkingDataAccessException, NoSuchMethodException, ClassNotFoundException, MissingSettingException, UnprocessableDocumentException { List<String> germanMentions = new ArrayList<>(); for (Map.Entry<String, List<DictionaryEntity>> e : dictionaryEntriesDataProvider) { String mention = e.getKey(); if (mention == null || "".equals(mention.trim())) { continue; } for (DictionaryEntity de : e.getValue()) { if (de.language.name().equals(language) && Yago3DictionaryEntriesSources.LABEL.equals(de.source)) { // String subject = de.entity; germanMentions.add(mention); } } } Tokens test; Map<String, Integer> counts = new LinkedHashMap<>(); int totalMentions = germanMentions.size(); int processed = 0; for (String mention : germanMentions) { processed++; StringJoiner sg = new StringJoiner(" "); String text = mention + " es una entidad nombrada en nuestra base de datos."; // test = tag(Language.getLanguageForString(language), text); test = tag(Language.getLanguageForString(language), text); int lenght = mention.length(); for (Token token : test.getTokens()) { if (token.getBeginIndex() <= lenght) { sg.add(token.getPOS()); } } counts.merge(sg.toString(), 1, (oldValue, one) -> oldValue + one); System.out.println("Processed " + processed + " mentions of " + totalMentions); } counts = CollectionUtils.sortMapByValue(counts, true); File fileOut = new File(destination); PrintWriter write = new PrintWriter(fileOut); for (Map.Entry<String, Integer> entry : counts.entrySet()) { write.println(entry.getKey().replaceAll(" ", "*")); } write.close(); } }