/* * Copyright (C) 2015 Adrien Guille <[email protected]> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package main.java.fr.ericlab.sondy.core.structures; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; import java.io.BufferedReader; import java.io.BufferedWriter; import main.java.fr.ericlab.sondy.core.app.Configuration; import main.java.fr.ericlab.sondy.core.text.index.GlobalIndexer; import main.java.fr.ericlab.sondy.core.text.nlp.ArabicStemming; import main.java.fr.ericlab.sondy.core.text.nlp.PersianStemming; import main.java.fr.ericlab.sondy.core.utils.PropertiesFileUtils; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.ObjectInputStream; import java.nio.file.Path; import java.nio.file.Paths; import java.text.DecimalFormat; import java.text.NumberFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Properties; import java.util.logging.Level; import java.util.logging.Logger; import javafx.collections.FXCollections; import javafx.collections.ObservableList; import main.java.fr.ericlab.sondy.core.text.index.Tokenizer; import main.java.fr.ericlab.sondy.core.text.nlp.EnglishStemming; import main.java.fr.ericlab.sondy.core.text.nlp.FrenchStemming; import main.java.fr.ericlab.sondy.core.utils.ArrayUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; /** * * @author Adrien GUILLE, Laboratoire ERIC, Université Lumière Lyon 2 * @author Farrokh GHAMSARY */ public class Corpus { // Properties public int messageCount; public int authorCount; public Date start; public Date end; public Path path; // Preprocessed corpus public String preprocessing = ""; public int timeSliceLength; public int[] messageDistribution; public short[][] termFrequencies; public ArrayList<String> vocabulary; public short[][] termMentionFrequencies; public ArrayList<String> mentionVocabulary; public String[] splitString(String str) { return str.split("\t"); } public void loadProperties(Path p) { try { path = p; String propertiesFilePath = Paths.get(path+File.separator+"messages.properties").toString(); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); messageCount = Integer.parseInt(PropertiesFileUtils.readProperty(propertiesFilePath, "messageCount")); authorCount = Integer.parseInt(PropertiesFileUtils.readProperty(propertiesFilePath, "authorCount")); start = dateFormat.parse(PropertiesFileUtils.readProperty(propertiesFilePath, "start")); end = dateFormat.parse(PropertiesFileUtils.readProperty(propertiesFilePath, "end")); preprocessing = ""; messageDistribution = null; termFrequencies = null; vocabulary = null; } catch (ParseException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } public String create(String id, String csvFilePath) { File dir = Paths.get(Configuration.datasets.toString() + File.separator + id).toFile(); int skippedLineCount = 0; try { Properties properties = new Properties(); HashSet<String> authors = new HashSet<>(); Date minDate = null, maxDate = null; BufferedReader bufferedReader = new BufferedReader(new FileReader(new File(csvFilePath))); messageCount = 0; SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); String line; String firstLine = bufferedReader.readLine(); String[] components = splitString(firstLine); while (components.length != 3 && (firstLine = bufferedReader.readLine()) != null) { components = splitString(firstLine); skippedLineCount++; } if (components.length == 3) { authors.add(components[0]); Date parsedDate = dateFormat.parse(components[1]); minDate = maxDate = parsedDate; File messages = new File(dir.getAbsolutePath() + File.separator + "messages.csv"); FileUtils.write(messages, ""); BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(messages)); bufferedWriter.write(firstLine); bufferedWriter.newLine(); while ((line = bufferedReader.readLine()) != null) { components = splitString(line); if (components.length == 3) { authors.add(components[0]); parsedDate = dateFormat.parse(components[1]); if (parsedDate.before(minDate)) { minDate = parsedDate; } else { if (parsedDate.after(maxDate)) { maxDate = parsedDate; } } messageCount++; bufferedWriter.write(line); bufferedWriter.newLine(); } else { skippedLineCount++; } } bufferedWriter.close(); } bufferedReader.close(); properties.setProperty("messageCount", messageCount+""); properties.setProperty("authorCount", authors.size()+""); properties.setProperty("start", dateFormat.format(minDate)); properties.setProperty("end", dateFormat.format(maxDate)); PropertiesFileUtils.saveProperties(Paths.get(Configuration.datasets + File.separator + id + File.separator + "messages.properties").toString(),properties); } catch (IOException | ParseException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } return "Messages: done (imported "+messageCount+" messages, skipped "+skippedLineCount+" misformatted lines)."; } public void lemmatize(Path path) { try { LineIterator lineIterator = FileUtils.lineIterator(new File(path.toString()+File.separator+"messages.csv")); Properties props = new Properties(); props.put("annotators", "tokenize,ssplit,parse,lemma"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); Annotation annotation; File lemmatizedFile = new File(path.toString()+File.separator+"lemmatized_messages.csv"); BufferedWriter bwLemmatizedFile = new BufferedWriter(new FileWriter(lemmatizedFile, true)); while(lineIterator.hasNext()){ String[] components = splitString(lineIterator.nextLine()); String text = components[2]; annotation = new Annotation(text); String lemmatizedText = ""; pipeline.annotate(annotation); List<CoreMap> lem = annotation.get(CoreAnnotations.SentencesAnnotation.class); for(CoreMap l: lem) { for (CoreLabel token: l.get(CoreAnnotations.TokensAnnotation.class)) { lemmatizedText += token.get(CoreAnnotations.LemmaAnnotation.class)+" "; } } if(text.contains("@")){ lemmatizedText += " @"; } bwLemmatizedFile.write(components[0]+"\t"+components[1]+"\t"+lemmatizedText+"\n"); } bwLemmatizedFile.close(); } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } public String preprocess(Path path,String stemming, String lemmatization, int ngram, int timeSliceLength){ Path preprocessPath = Paths.get(path+File.separator+stemming+"-"+lemmatization+"-"+ngram+"-"+timeSliceLength); preprocessPath.normalize(); File dir = preprocessPath.toFile(); File sourceFile = new File(path.toString()+File.separator+"messages.csv"); if(lemmatization.equals("English")){ File lemmatizedFile = new File(path.toString()+File.separator+"lemmatized_messages.csv"); if(!lemmatizedFile.exists()){ lemmatize(path); } sourceFile = lemmatizedFile; } if(!dir.exists()){ try { dir.mkdir(); BufferedReader bufferedReader = new BufferedReader(new FileReader(sourceFile)); BufferedWriter bwText = null, bwTime = null, bwAuthor = null; SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long startTime = start.getTime(); NumberFormat formatter = new DecimalFormat("00000000"); Analyzer analyzer = new StandardAnalyzer(); String line; int timeSlice = -1; while ((line = bufferedReader.readLine()) != null) { String[] components = splitString(line); Date parsedDate = dateFormat.parse(components[1]); double diff = (parsedDate.getTime() - startTime) / (60 * 1000); if (timeSlice != (int) (diff / timeSliceLength)) { timeSlice = (int) (diff / timeSliceLength); if (bwText != null) { bwText.close(); } File fileText = new File(preprocessPath + File.separator + formatter.format(timeSlice) + ".text"); if (!fileText.exists()) { FileUtils.write(fileText, ""); } bwText = new BufferedWriter(new FileWriter(fileText, true)); if (bwTime != null) { bwTime.close(); } File fileTime = new File(preprocessPath + File.separator + formatter.format(timeSlice) + ".time"); if (!fileTime.exists()) { FileUtils.write(fileText, ""); } bwTime = new BufferedWriter(new FileWriter(fileTime, true)); if (bwAuthor != null) { bwAuthor.close(); } File fileAuthor = new File(preprocessPath + File.separator + formatter.format(timeSlice) + ".author"); if (!fileAuthor.exists()) { FileUtils.write(fileAuthor, ""); } bwAuthor = new BufferedWriter(new FileWriter(fileAuthor, true)); } String text = components[2]; if (!stemming.equals("disabled")) { String newText = ""; List<String> tokenList = Tokenizer.tokenizeString(analyzer, text); switch (stemming) { case "French": FrenchStemming frenchStemming = new FrenchStemming(); for (String token : tokenList) newText += frenchStemming.stem(token) + " "; break; case "Arabic": ArabicStemming arabicStemming = new ArabicStemming(); for (String token : tokenList) newText += arabicStemming.stem(token) + " "; break; case "Persian": PersianStemming persianStemming = new PersianStemming(); for (String token : tokenList) newText += persianStemming.stem(token) + " "; break; case "English": EnglishStemming englishStemming = new EnglishStemming(); for (String token : tokenList) newText += englishStemming.stem(token) + " "; default: break; } text = newText; } if (ngram > 1) { String newText = ""; List<String> tokenList = Tokenizer.tokenizeString(analyzer, text); for (int token = 0; token < tokenList.size() - 1 - ngram; token++) { for (int n = 0; n < ngram; n++) { newText += tokenList.get(token + n); if (n == ngram - 1) { newText += " "; } else { newText += "_"; } } } text = newText; } bwText.write(text); bwText.newLine(); bwTime.write(components[1]); bwTime.newLine(); bwAuthor.write(components[0]); bwAuthor.newLine(); } bwText.close(); bwTime.close(); bwAuthor.close(); bufferedReader.close(); GlobalIndexer indexer = new GlobalIndexer(Configuration.numberOfCores, false); indexer.index(preprocessPath.toString()); indexer = new GlobalIndexer(Configuration.numberOfCores, true); indexer.index(preprocessPath.toString()); return "Done."; } catch (IOException | ParseException | InterruptedException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } return "Error:"; } public void loadFrequencies(String preprocessedCorpus){ preprocessing = preprocessedCorpus; FileInputStream fisMatrix = null; try { setTimeSliceLength(); fisMatrix = new FileInputStream(path+File.separator+preprocessing+File.separator+"indexes/frequencyMatrix.dat"); ObjectInputStream oisMatrix = new ObjectInputStream(fisMatrix); termFrequencies = (short[][]) oisMatrix.readObject(); FileInputStream fisVocabulary = new FileInputStream(path+File.separator+preprocessing+File.separator+"indexes/vocabulary.dat"); ObjectInputStream oisVocabulary = new ObjectInputStream(fisVocabulary); vocabulary = (ArrayList<String>) oisVocabulary.readObject(); FileInputStream fisDistribution = new FileInputStream(path+File.separator+preprocessing+File.separator+"indexes/messageCountDistribution.dat"); ObjectInputStream oisDistribution = new ObjectInputStream(fisDistribution); messageDistribution = (int[]) oisDistribution.readObject(); } catch (FileNotFoundException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException | ClassNotFoundException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } finally { try { fisMatrix.close(); } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } } public void loadMentionFrequencies(){ FileInputStream fisMatrix = null; try { setTimeSliceLength(); fisMatrix = new FileInputStream(path+File.separator+preprocessing+File.separator+"indexes/mentionFrequencyMatrix.dat"); ObjectInputStream oisMatrix = new ObjectInputStream(fisMatrix); termMentionFrequencies = (short[][]) oisMatrix.readObject(); FileInputStream fisVocabulary = new FileInputStream(path+File.separator+preprocessing+File.separator+"indexes/mentionVocabulary.dat"); ObjectInputStream oisVocabulary = new ObjectInputStream(fisVocabulary); mentionVocabulary = (ArrayList<String>) oisVocabulary.readObject(); } catch (FileNotFoundException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException | ClassNotFoundException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } finally { try { fisMatrix.close(); } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } } public short[] getTermFrequency(String term){ int i = vocabulary.indexOf(term); return termFrequencies[i]; } public short[] getTermMentionFrequency(String term){ int i = mentionVocabulary.indexOf(term); return termMentionFrequencies[i]; } public String getMessages(String term, int timeSliceA, int timeSliceB){ String messages = ""; NumberFormat formatter = new DecimalFormat("00000000"); for(int i = timeSliceA; i <= timeSliceB; i++){ try { File textFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".text"); List<String> lines = FileUtils.readLines(textFile); for(String line : lines){ if(StringUtils.containsIgnoreCase(line,term)){ messages += line+"\n"; } } } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } return messages; } public ObservableList<Message> getMessages(Event event){ ObservableList<Message> messages = FXCollections.observableArrayList(); String[] interval = event.getTemporalDescription().split(","); int timeSliceA = convertDayToTimeSlice(Double.parseDouble(interval[0])); int timeSliceB = convertDayToTimeSlice(Double.parseDouble(interval[1])); String term = event.getTextualDescription().split(" ")[0]; NumberFormat formatter = new DecimalFormat("00000000"); for(int i = timeSliceA; i <= timeSliceB; i++){ try { File textFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".text"); File timeFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".time"); File authorFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".author"); LineIterator textIter = FileUtils.lineIterator(textFile); LineIterator timeIter = FileUtils.lineIterator(timeFile); LineIterator authorIter = FileUtils.lineIterator(authorFile); while(textIter.hasNext()){ String text = textIter.nextLine(); String author = authorIter.nextLine(); String time = timeIter.nextLine(); if(StringUtils.containsIgnoreCase(text,term)){ messages.add(new Message(author,time,text)); } } } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } return messages; } public ObservableList<Message> getFilteredMessages(Event event, String[] words, int operator){ ObservableList<Message> messages = FXCollections.observableArrayList(); String[] interval = event.getTemporalDescription().split(","); int timeSliceA = convertDayToTimeSlice(Double.parseDouble(interval[0])); int timeSliceB = convertDayToTimeSlice(Double.parseDouble(interval[1])); String term = event.getTextualDescription().split(" ")[0]; NumberFormat formatter = new DecimalFormat("00000000"); for(int i = timeSliceA; i <= timeSliceB; i++){ try { File textFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".text"); File timeFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".time"); File authorFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".author"); LineIterator textIter = FileUtils.lineIterator(textFile); LineIterator timeIter = FileUtils.lineIterator(timeFile); LineIterator authorIter = FileUtils.lineIterator(authorFile); while(textIter.hasNext()){ String text = textIter.nextLine(); short[] test = new short[words.length]; for(int j = 0; j < words.length; j++){ if(StringUtils.containsIgnoreCase(text,words[j])){ test[j] = 1; }else{ test[j] = 0; } } if(StringUtils.containsIgnoreCase(text,term)){ int testSum = ArrayUtils.sum(test, 0, test.length-1); String author = authorIter.nextLine(); String time = timeIter.nextLine(); if(operator==0 && testSum == test.length){ messages.add(new Message(author,time,text)); } if(operator==1 && testSum > 0){ messages.add(new Message(author,time,text)); } } } } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } return messages; } public ObservableList<Message> getMessages(String user){ ObservableList<Message> messages = FXCollections.observableArrayList(); try { File messagesFile = new File(path.toString() + File.separator + "messages.csv"); LineIterator lineIterator = FileUtils.lineIterator(messagesFile); while (lineIterator.hasNext()) { String line = lineIterator.nextLine(); String[] components = splitString(line); if(components[0].equals(user)){ messages.add(new Message(components[0],components[1],components[2])); } } return messages; } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } return messages; } public HashSet<String> getAuthors(Event event){ HashSet<String> authors = new HashSet<>(); String[] interval = event.getTemporalDescription().split(","); int timeSliceA = convertDayToTimeSlice(Double.parseDouble(interval[0])); int timeSliceB = convertDayToTimeSlice(Double.parseDouble(interval[1])); String term = event.getTextualDescription().split(" ")[0]; NumberFormat formatter = new DecimalFormat("00000000"); for(int i = timeSliceA; i <= timeSliceB; i++){ try { File textFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".text"); File authorFile = new File(path+File.separator+preprocessing+File.separator+formatter.format(i)+".author"); LineIterator textIter = FileUtils.lineIterator(textFile); LineIterator authorIter = FileUtils.lineIterator(authorFile); while(textIter.hasNext()){ String text = textIter.nextLine(); String author = authorIter.nextLine(); if(text.contains(term)){ authors.add(author); } } } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } } return authors; } public int getNumberOfTermsInTimeSlice(int timeSlice){ int count = 0; try { NumberFormat formatter = new DecimalFormat("00000000"); List<String> lines = FileUtils.readLines(new File(path+File.separator+preprocessing+File.separator+formatter.format(timeSlice)+".text")); for(String line : lines){ for(int i = 0; i < line.length(); i++) if(Character.isWhitespace(line.charAt(i))) count++; count++; } } catch (IOException ex) { Logger.getLogger(Corpus.class.getName()).log(Level.SEVERE, null, ex); } return count; } public void setTimeSliceLength(){ String[] components = preprocessing.split("-"); timeSliceLength = Integer.parseInt(components[3]); } public double convertTimeSliceToDay(int timeSlice){ double norm = (((double)timeSliceLength)/60.0)/24.0; return ((double)timeSlice) * norm; } public int convertDayToTimeSlice(double day){ double norm = 24*60/(double)timeSliceLength; int timeSlice = (int) Math.round(day * norm); return timeSlice; } public double getLength(){ return (end.getTime() - start.getTime())/(1000*60*60*24L)+1; } }