package edu.ehu.galan.rake;

/*
 * RakeAlgorithm.java
 * Copyright (C) 2014 Angel Conde, neuw84 at gmail dot com
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

import edu.ehu.galan.rake.model.AbstractAlgorithm;
import edu.ehu.galan.rake.model.Document;
import edu.ehu.galan.rake.model.Term;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.stream.Collectors.toList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An implementation of the RAKE (Rapid Automatic Keyword Extraction) algorithm:
 * <i>Rose, Stuart, et al. "Automatic keyword extraction from individual
 * documents." Text Mining (2010): 1-20.</i>
 *
 * This implementation is based on JATE (https://code.google.com/p/jatetoolkit/)
 * and on https://github.com/aneesha/RAKE; provided with a good stopword list
 * and a punctuation list, it gives results similar to the Python script.
 *
 * Numbers are handled using the JATE method. The algorithm expects punctuation
 * marks to be separated by whitespace, e.g.
 * " The red table , that is in front of you , is mine . "
 * To achieve this you should use a parser such as OpenNLP, the Illinois POS
 * Tagger, Freeling, etc.
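 *
 * A minimal usage sketch (the file paths are hypothetical, and the
 * {@code Document} is assumed to already contain a sentence list produced by
 * such a parser):
 * <pre>{@code
 * RakeAlgorithm rake = new RakeAlgorithm();
 * rake.loadStopWordsList("resources/stopwords_en.txt"); // hypothetical path
 * rake.loadPunctStopWord("resources/punct_en.txt");     // hypothetical path
 * rake.init(doc, "");   // pPropsDir is unused by this implementation
 * rake.runAlgorithm();
 * List<Term> keywords = doc.getTermList(); // ranked by descending RAKE score
 * }</pre>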
 *
 * TODO: use POS tags to avoid verbs and other unwanted types of words in the
 * keyword generation process.
 *
 * @author Angel Conde Manjon
 */
public class RakeAlgorithm extends AbstractAlgorithm {

    private transient Document doc = null;
    private final transient List<Term> termList;
    private List<String> stopWordList;
    private final transient Logger logger = LoggerFactory.getLogger(this.getClass());
    private List<Pattern> regexList = null;
    private List<String> punctList;
    private int minNumberOfletters = 2;

    /**
     * Builds a RAKE algorithm with empty stopword and punctuation lists.
     */
    public RakeAlgorithm() {
        super(true, "RAKE");
        termList = super.getTermList();
        stopWordList = new ArrayList<>();
        regexList = new ArrayList<>();
        punctList = new ArrayList<>();
    }

    @Override
    public void init(Document pDoc, String pPropsDir) {
        setDoc(pDoc);
        doc = pDoc;
    }

    /**
     * This method requires a list of stopwords to build the candidate list;
     * each sentence is searched for these stopwords to delimit candidate
     * generation.
     *
     * @param pStopWords - a list of stopwords
     */
    public void loadStopWordsList(List<String> pStopWords) {
        stopWordList = pStopWords;
    }

    /**
     * This method requires a list of stopwords to build the candidate list;
     * each sentence is searched for these stopwords to delimit candidate
     * generation.
     *
     * @param pLoc - the location of the file containing the stopwords
     */
    public void loadStopWordsList(String pLoc) {
        List<String> stops = new ArrayList<>();
        try {
            List<String> words = Files.readAllLines(Paths.get(pLoc), StandardCharsets.UTF_8);
            for (String string : words) {
                stops.add(string.trim());
            }
            stopWordList = stops;
        } catch (IOException ex) {
            logger.error("Error loading RAKE stopWordList from: " + pLoc, ex);
        }
    }

    /**
     * As this class uses regexes for candidate generation, custom regular
     * expressions may be added via this method (uses the Java Pattern/Matcher
     * mechanism).
     *
     * @param pat - the custom pattern to add
     */
    public void addCustomRegex(Pattern pat) {
        regexList.add(pat);
    }

    private Pattern buildStopWordRegex(List<String> pStopWords) {
        StringBuilder sb = new StringBuilder();
        for (String string : pStopWords) {
            sb.append("\\b").append(string.trim()).append("\\b").append("|");
        }
        String pattern = sb.substring(0, sb.length() - 1);
        return Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }

    /**
     * This method works better with a punctuation stop list; for English,
     * Spanish, and Latin-based languages in general, the list could be
     * (.,/{}[];:)
     *
     * @param pLoc - the location of the file containing the punctuation marks
     */
    public void loadPunctStopWord(String pLoc) {
        List<String> stops = new ArrayList<>();
        try {
            List<String> words = Files.readAllLines(Paths.get(pLoc), StandardCharsets.UTF_8);
            for (String string : words) {
                stops.add(string.trim());
            }
            punctList = stops;
        } catch (IOException ex) {
            logger.error("Error loading RAKE punctList from: " + pLoc, ex);
        }
    }

    /**
     * (OPTIONAL) This method works better with a punctuation stop list; for
     * English, Spanish, and Latin-based languages in general, the list could
     * be (.,/{}[];:)
     *
     * @param pPunt - the string list to be added
     */
    public void loadPunctStopWord(List<String> pPunt) {
        punctList = pPunt;
    }

    private Pattern buildPunctStopWord(List<String> pPunctStop) {
        StringBuilder sb = new StringBuilder();
        for (String string : pPunctStop) {
            sb.append("\\").append(string.trim()).append("|");
        }
        String pattern = sb.substring(0, sb.length() - 1);
        return Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }
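    /*
     * Candidate-generation walkthrough (a sketch with a hypothetical stopword
     * list; nothing here is executed): given the stopwords ["the", "is", "of"],
     * buildStopWordRegex compiles "\bthe\b|\bis\b|\bof\b", and given the
     * punctuation list [",", "."], buildPunctStopWord compiles "\,|\.".
     * generateCandidateKeywords (below) replaces every match with "|" and
     * splits on it, so the sentence
     * " The red table , that is in front of you , is mine . "
     * yields the multi-word candidates "red table" and "in front".
     */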
    private List<String> generateCandidateKeywords(List<String> pSentenceList, List<Pattern> pStopWordPattern) {
        List<String> candidates = new ArrayList<>();
        StringBuffer sb = new StringBuffer();
        for (String string : pSentenceList) {
            // Replace every stopword/punctuation match with "|", marking the
            // boundaries between candidate phrases
            for (Pattern pat : pStopWordPattern) {
                Matcher matcher = pat.matcher(string.trim());
                while (matcher.find()) {
                    matcher.appendReplacement(sb, "|");
                }
                matcher.appendTail(sb);
                if (sb.length() > 0) {
                    string = sb.toString();
                }
                sb = new StringBuffer();
            }
            List<String> cands = Arrays.asList(string.split("\\|"));
            for (String string1 : cands) {
                String cand = string1.trim();
                // Keep digit-free, multi-word candidates longer than the
                // configured minimum (previously a hardcoded 2, which left
                // minNumberOfletters unused)
                if (cand.length() > minNumberOfletters && cand.split("\\s+").length > 1 && !containsDigit(cand)) {
                    candidates.add(cand);
                }
            }
        }
        return candidates;
    }

    @Override
    public void runAlgorithm() {
        if (stopWordList.isEmpty()) {
            logger.error("The algorithm " + this.getName() + " requires a stopword list to build the candidate list");
            return;
        }
        Map<String, Integer> wordfreq = new HashMap<>();
        Map<String, Integer> worddegree = new HashMap<>();
        Map<String, Float> wordscore = new HashMap<>();
        regexList.add(buildStopWordRegex(stopWordList));
        if (!punctList.isEmpty()) {
            regexList.add(buildPunctStopWord(punctList));
        }
        List<String> candidates = generateCandidateKeywords(doc.getSentenceList(), regexList);
        // Word frequency and degree: each candidate phrase of n words adds
        // n - 1 to the degree of each of its words
        for (String phrase : candidates) {
            String[] wordlist = phrase.split("\\s+");
            int wordlistdegree = wordlist.length - 1;
            for (String word : wordlist) {
                wordfreq.merge(word, 1, Integer::sum);
                worddegree.merge(word, wordlistdegree, Integer::sum);
            }
        }
        // deg(w) = within-phrase co-occurrence count + freq(w), as in the paper
        for (Map.Entry<String, Integer> entry : worddegree.entrySet()) {
            entry.setValue(entry.getValue() + wordfreq.get(entry.getKey()));
        }
        // Word score: deg(w) / freq(w)
        for (Map.Entry<String, Integer> entry : wordfreq.entrySet()) {
            wordscore.put(entry.getKey(), worddegree.get(entry.getKey()) / (entry.getValue() * 1.0f));
        }
        // A candidate phrase scores the sum of its word scores
        List<Term> termLi = new ArrayList<>();
        for (String phrase : candidates) {
            float score = 0.0f;
            for (String word : phrase.split("\\s+")) {
                score += wordscore.get(word);
            }
            termLi.add(new Term(phrase, score));
        }
        // Sort by descending score; distinct() removes duplicate terms
        Comparator<? super Term> sorter = (o1, o2) -> o1.getScore() > o2.getScore() ? -1 : o1.getScore() == o2.getScore() ? 0 : 1;
        List<Term> orderedList = termLi.parallelStream().sorted(sorter).distinct().collect(toList());
        doc.setTermList(orderedList);
    }

    /**
     * @return the document being processed
     */
    public Document getDoc() {
        return doc;
    }

    /**
     * @param doc the document to set
     */
    public void setDoc(Document doc) {
        this.doc = doc;
    }

    /**
     * Returns the current minimum number of letters (default 2) required for a
     * candidate to be included.
     *
     * @return the minNumberOfletters
     */
    public int getMinNumberOfletters() {
        return minNumberOfletters;
    }

    /**
     * Sets the minimum number of letters (default 2) required for a candidate
     * to be included.
     *
     * @param minNumberOfletters the minNumberOfletters to set
     */
    public void setMinNumberOfletters(int minNumberOfletters) {
        this.minNumberOfletters = minNumberOfletters;
    }

    private boolean containsDigit(String string) {
        for (char c : string.toCharArray()) {
            if (Character.isDigit(c)) {
                return true;
            }
        }
        return false;
    }
}
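/*
 * Worked scoring example (values follow from the formulas in runAlgorithm;
 * the candidates are hypothetical): suppose the candidate phrases are
 * "red table" and "big red table".
 *
 *   freq:  red = 2, table = 2, big = 1
 *   degree (sum of phraseLength - 1 per phrase, plus freq):
 *          red = (1 + 2) + 2 = 5, table = (1 + 2) + 2 = 5, big = 2 + 1 = 3
 *   score = degree / freq:  red = 2.5, table = 2.5, big = 3.0
 *
 * Phrase scores: "red table" = 2.5 + 2.5 = 5.0 and
 * "big red table" = 3.0 + 2.5 + 2.5 = 8.0, so the longer phrase made of
 * high-degree words is ranked first.
 */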