/* * Copyright (C) 2015 Adrien Guille <[email protected]> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main.java.fr.ericlab.sondy.core.text.index; import main.java.fr.ericlab.sondy.core.app.AppParameters; import main.java.fr.ericlab.sondy.core.utils.HashMapUtils; import java.io.File; import java.io.IOException; import java.io.StringReader; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** * * @author Adrien GUILLE, Laboratoire ERIC, Université Lumière Lyon 2 */ public final class Indexer extends Thread { int threadId; int minWordLength = 2; String directory; int from; int to; boolean mention; ArrayList<HashMap<String,Short>> mapList; HashMap<Integer,Integer> messageCountDistribution; int messageCount; public Indexer(){ } public Indexer(int id, String d, int a, int b, boolean m, int n){ directory = d; threadId = id; from = a; to = b; mention = m; } public ArrayList<String> getMostFrequentWords(String text, String sourceWord, int numberOfWords){ HashMap<String,Short> map = indexString(text); Map<String, Short> sortedMap = HashMapUtils.sortByDescValue(map); ArrayList<String> list = new ArrayList<>(); int count = 0; Set<Entry<String, Short>> entrySet = sortedMap.entrySet(); Iterator iterator = entrySet.iterator(); while(count <= numberOfWords && iterator.hasNext()){ String word = (String)((Entry)(iterator.next())).getKey(); if(!word.equals(sourceWord) && !AppParameters.stopwords.contains(word)){ list.add(word); count++; } } return list; } public HashMap<String,Short> indexString(String text){ HashMap<String,Short> map = new HashMap<>(); Analyzer analyzer = new StandardAnalyzer(); String cleanText = text.toLowerCase(); List<String> strings = Tokenizer.tokenizeString(analyzer, cleanText); for(String string : strings){ if(string.length()>=minWordLength){ Short count = map.get(string); if(count == null){ count = 0; } count++; map.put(string,count); } } return map; } public HashMap<String,Short> indexFile(int i, String filePath) { HashMap<String,Short> map = new HashMap<>(); try { List<String> lines = FileUtils.readLines(new File(filePath)); Analyzer analyzer = new StandardAnalyzer(); int messageCountFile = 0; for(String line : lines){ if(!mention || (mention && line.contains("@"))){ messageCountFile++; String cleanLine = line.toLowerCase(); List<String> strings = Tokenizer.tokenizeString(analyzer, cleanLine); for(String string : strings){ if(string.length()>=minWordLength){ Short count = map.get(string); if(count == null){ count = 0; } count++; map.put(string,count); } } } } messageCountDistribution.put(i,messageCountFile); messageCount += messageCountFile; } catch (IOException ex) { Logger.getLogger(Indexer.class.getName()).log(Level.SEVERE, null, ex); } return map; } @Override public void run() { messageCountDistribution = new HashMap<>(); mapList = new ArrayList<>(to-from+10); NumberFormat formatter = new DecimalFormat("00000000"); for(int i = from; i <= to; i++){ mapList.add(indexFile(i,directory+File.separator+formatter.format(i)+".text")); } } public static HashSet<String> getVocabulary(List<HashMap<String,Short>> mapList){ HashSet<String> vocabulary = new HashSet<>(); for(HashMap<String,Short> map : mapList){ for(String string : map.keySet()){ vocabulary.add(string); } } return vocabulary; } }