/** * Author: WuLC * Date: 2016-05-22 17:46:15 * Last modified by: WuLC * Last Modified time: 2016-05-23 23:31:25 * Email: [email protected] ************************************************************ * Function:get keywords of file through TF-IDF algorithm * Input: path of directory of files that need to extract keywords * Output: keywords of each file */ package com.lc.nlp.keyword.algorithm; import java.io.*; import java.util.*; import com.hankcs.hanlp.HanLP; import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary; import com.hankcs.hanlp.seg.common.Term; import com.lc.nlp.parsedoc.*; public class TFIDF { private static int keywordsNumber = 5; /** * change the number of keywords,default 5 * @param keywordNum(int): number of keywords that need to be extracted */ public static void setKeywordsNumber(int keywordNum) { keywordsNumber = keywordNum; } /** * calculate TF value of each word in terms of the content of file * @param fileContent(String): content of file * @return(HashMap<String, Float>): "words:TF value" pairs */ public static HashMap<String, Float> getTF(String fileContent) { List<Term> terms=new ArrayList<Term>(); ArrayList<String> words = new ArrayList<String>(); terms=HanLP.segment(fileContent); for(Term t:terms) { if(TFIDF.shouldInclude(t)) { words.add(t.word); } } // get TF values HashMap<String, Integer> wordCount = new HashMap<String, Integer>(); HashMap<String, Float> TFValues = new HashMap<String, Float>(); for(String word : words) { if(wordCount.get(word) == null) { wordCount.put(word, 1); } else { wordCount.put(word, wordCount.get(word) + 1); } } int wordLen = words.size(); //traverse the HashMap Iterator<Map.Entry<String, Integer>> iter = wordCount.entrySet().iterator(); while(iter.hasNext()) { Map.Entry<String, Integer> entry = (Map.Entry<String, Integer>)iter.next(); TFValues.put(entry.getKey().toString(), Float.parseFloat(entry.getValue().toString()) / wordLen); //System.out.println(entry.getKey().toString() + " = "+ Float.parseFloat(entry.getValue().toString()) / wordLen); } return TFValues; } /** * judge whether a word belongs to stop words * @param term(Term): word needed to be judged * @return(boolean): if the word is a stop word,return false;otherwise return true */ public static boolean shouldInclude(Term term) { return CoreStopWordDictionary.shouldInclude(term); } /** * calculate TF values for each word of each file under a directory * @param dirPath(String): path of the directory * @return(HashMap<String,HashMap<String, Float>>): path of file and its corresponding "word-TF Value" pairs * @throws IOException */ public static HashMap<String,HashMap<String, Float>> tfForDir(String dirPath) { HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>(); List<String> filelist = ReadDir.readDirFileNames(dirPath); for(String file : filelist) { HashMap<String, Float> dict = new HashMap<String, Float>(); String content = ReadFile.loadFile(file); // remember to modify the loadFile method of class ReadFile dict = TFIDF.getTF(content); allTF.put(file, dict); } return allTF; } /** * calculate IDF values for each word under a directory * @param dirPath(String): path of the directory * @return(HashMap<String, Float>): "word:IDF Value" pairs */ public static HashMap<String, Float> idfForDir(String dirPath) { List<String> fileList = new ArrayList<String>(); fileList = ReadDir.readDirFileNames(dirPath); int docNum = fileList.size(); Map<String, Set<String>> passageWords = new HashMap<String, Set<String>>(); // get words that are not repeated of a file for(String filePath:fileList) { List<Term> terms=new ArrayList<Term>(); Set<String> words = new HashSet<String>(); String content = ReadFile.loadFile(filePath); // remember to modify the loadFile method of class ReadFile terms=HanLP.segment(content); for(Term t:terms) { if(TFIDF.shouldInclude(t)) { words.add(t.word); } } passageWords.put(filePath, words); } // get IDF values HashMap<String, Integer> wordPassageNum = new HashMap<String, Integer>(); for(String filePath : fileList) { Set<String> wordSet = new HashSet<String>(); wordSet = passageWords.get(filePath); for(String word:wordSet) { if(wordPassageNum.get(word) == null) wordPassageNum.put(word,1); else wordPassageNum.put(word, wordPassageNum.get(word) + 1); } } HashMap<String, Float> wordIDF = new HashMap<String, Float>(); Iterator<Map.Entry<String, Integer>> iter_dict = wordPassageNum.entrySet().iterator(); while(iter_dict.hasNext()) { Map.Entry<String, Integer> entry = (Map.Entry<String, Integer>)iter_dict.next(); float value = (float)Math.log( docNum / (Float.parseFloat(entry.getValue().toString())) ); wordIDF.put(entry.getKey().toString(), value); //System.out.println(entry.getKey().toString() + "=" +value); } return wordIDF; } /** * calculate TF-IDF value for each word of each file under a directory * @param dirPath(String): path of the directory * @return(Map<String, HashMap<String, Float>>): path of file and its corresponding "word:TF-IDF Value" pairs */ public static Map<String, HashMap<String, Float>> getDirTFIDF(String dirPath) { HashMap<String, HashMap<String, Float>> dirFilesTF = new HashMap<String, HashMap<String, Float>>(); HashMap<String, Float> dirFilesIDF = new HashMap<String, Float>(); dirFilesTF = TFIDF.tfForDir(dirPath); dirFilesIDF = TFIDF.idfForDir(dirPath); Map<String, HashMap<String, Float>> dirFilesTFIDF = new HashMap<String, HashMap<String, Float>>(); Map<String,Float> singlePassageWord= new HashMap<String,Float>(); List<String> fileList = new ArrayList<String>(); fileList = ReadDir.readDirFileNames(dirPath); for (String filePath: fileList) { HashMap<String,Float> temp= new HashMap<String,Float>(); singlePassageWord = dirFilesTF.get(filePath); Iterator<Map.Entry<String, Float>> it = singlePassageWord.entrySet().iterator(); while(it.hasNext()) { Map.Entry<String, Float> entry = it.next(); String word = entry.getKey(); Float TFIDF = entry.getValue()*dirFilesIDF.get(word); temp.put(word, TFIDF); } dirFilesTFIDF.put(filePath, temp); } return dirFilesTFIDF; } /** * get keywords of each file under a certain directory * @param dirPath(String): path of directory * @param keywordNum(int): number of keywords to extract * @return(Map<String,List<String>>): path of file and its corresponding keywords */ public static Map<String,List<String>> getKeywords(String dirPath) { List<String> fileList = new ArrayList<String>(); fileList = ReadDir.readDirFileNames(dirPath); // calculate TF-IDF value for each word of each file under the dirPath Map<String, HashMap<String, Float>> dirTFIDF = new HashMap<String, HashMap<String, Float>>(); dirTFIDF = TFIDF.getDirTFIDF(dirPath); Map<String,List<String>> keywordsForDir = new HashMap<String,List<String>>(); for (String file:fileList) { Map<String,Float> singlePassageTFIDF= new HashMap<String,Float>(); singlePassageTFIDF = dirTFIDF.get(file); //sort the keywords in terms of TF-IDF value in descending order List<Map.Entry<String,Float>> entryList=new ArrayList<Map.Entry<String,Float>>(singlePassageTFIDF.entrySet()); Collections.sort(entryList,new Comparator<Map.Entry<String,Float>>() { @Override public int compare(Map.Entry<String,Float> c1,Map.Entry<String,Float> c2) { return c2.getValue().compareTo(c1.getValue()); } } ); // get keywords List<String> systemKeywordList=new ArrayList<String>(); for(int k=0;k<keywordsNumber;k++) { try { systemKeywordList.add(entryList.get(k).getKey()); } catch(IndexOutOfBoundsException e) { continue; } } keywordsForDir.put(file, systemKeywordList); } return keywordsForDir; } }