/*
 * EntropyChunker.java
 *
 * Created on March 31, 2008, 2:43 PM
 *
 */

package gr.demokritos.iit.summarization.analysis;

import gr.demokritos.iit.conceptualIndex.documentModel.SymbolicGraph;
import gr.demokritos.iit.conceptualIndex.structs.Distribution;
import gr.demokritos.iit.jinsect.algorithms.nlp.IChunker;
import gr.demokritos.iit.jinsect.algorithms.statistics.statisticalCalculation;
import gr.demokritos.iit.jinsect.structs.CategorizedFileEntry;
import gr.demokritos.iit.jinsect.structs.EdgeCachedLocator;
import gr.demokritos.iit.jinsect.utils;
import java.io.IOException;
import java.io.ObjectStreamException;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import salvo.jesus.graph.Vertex;
import salvo.jesus.graph.VertexImpl;
import salvo.jesus.graph.WeightedEdge;

/** This class can separate a token sequence into chunks, based on the entropy of the
 * following symbol.
 *
 * @author ggianna
 * @licence LGPL
 */
public class EntropyChunker implements Serializable, IChunker {
    /** The graph containing symbol sequence information. */
    SymbolicGraph sgOverallGraph;
    // Cache used to speed up repeated vertex/edge lookups in sgOverallGraph.
    // Rebuilt (not serialized) on deserialization — see readObject.
    EdgeCachedLocator clLocator;
    // Delimiter map, keyed by next-character entropy (Double) with the delimiter
    // string as value — see getDelimiters(). Null until computed or after
    // clearDelimiters().
    SortedMap smDelims;

    /** {@link Serializable} interface implementer. */
    private void writeObject(java.io.ObjectOutputStream out) throws IOException {
        out.writeObject(sgOverallGraph); // Store graph
        out.writeObject(smDelims); // Store delims
    }

    /** {@link Serializable} interface implementer. */
    private void readObject(java.io.ObjectInputStream in) throws IOException,
            ClassNotFoundException {
        // Load graph
        sgOverallGraph = (SymbolicGraph)in.readObject();
        // Load delims
        smDelims = (SortedMap)in.readObject();
        // Re-init cache (transient by design: not written by writeObject)
        clLocator = new EdgeCachedLocator(100);
    }

    /** {@link Serializable} interface implementer.
*/ private void readObjectNoData() throws ObjectStreamException { return; // Do nothing } /** Creates a new instance of EntropyChunker. */ public EntropyChunker() { sgOverallGraph = new SymbolicGraph(1,1); // Re-init cache clLocator = new EdgeCachedLocator(100); smDelims = null; } /** Train the statistics of the chunker from a given file set. *@param sFiles The set of {@link CategorizedFileEntry} objects to use for *training. */ public void train(Set<String> sFileNames) { Iterator<String> iFile = sFileNames.iterator(); while (iFile.hasNext()) { String sText = utils.loadFileToString(iFile.next()); train(sText); } } /** Train the statistics of the chunker from a given text. *@param sTrainingText The text that defines the statistics used by the * chunker. */ public void train(String sTrainingText) { // Update graph sgOverallGraph.setDataString(sTrainingText + (new StringBuffer().append((char)StreamTokenizer.TT_EOF)).toString()); // Reset cache clLocator.resetCache(); // Calculate delimiters getDelimiters(); } /** Clears list of delimiters determined. */ public void clearDelimiters() { smDelims = null; } /** Returns a sorted map of delimiters, based on their entropy of next character measure. *@return The {@link SortedMap} of Delimiters, where each delimiter is matched to its entropy measure. 
*/ public SortedMap getDelimiters() { // If extracted then return a copy if (smDelims != null) return new TreeMap(smDelims); // Else extract smDelims = identifyCandidateDelimiters(sgOverallGraph.getDataString(), 1); int iImportant = determineImportantDelimiters(smDelims); Iterator iIter = smDelims.keySet().iterator(); int iCnt = 0; while (iIter.hasNext() && (iCnt++ < smDelims.size() - iImportant)) iIter.next(); smDelims = smDelims.tailMap(iIter.next()); if (!smDelims.containsValue(StreamTokenizer.TT_EOF)) { smDelims.put((Double)smDelims.lastKey() + 0.1, new StringBuffer().append((char)StreamTokenizer.TT_EOF).toString()); // Add EOF char } // Return copy of delims return new TreeMap(smDelims); } /** Returns a list of string chunks, derived from a given string. *@param sToChunk The string to chunk. *@return A {@link List} of strings that are the chunks of the given string. */ @Override public List chunkString(String sToChunk) { Integer[] iRes = splitPointsByDelimiterList(sToChunk, getDelimiters()); String[] sRes = splitStringByDelimiterPoints(sToChunk, iRes); return Arrays.asList(sRes); } /* Returns a list of indices concerning possible split points. *@param sStr The string to split. *@param lDelimiters A {@link SortedMap} of delimiter strings. *@return An array of integers, indicating the split points for the given * string. 
*/ protected Integer[] splitPointsByDelimiterList(String sStr, SortedMap lDelimiters) { ArrayList alRes = new ArrayList(); TreeMap lLocal = new TreeMap(); lLocal.putAll(lDelimiters); // For every candidate delimiter while (lLocal.size() > 0) { Object oNext = lLocal.lastKey(); // Get all split points int iNextSplit = 0; int iLastSplit = 0; while ((iNextSplit = sStr.indexOf((String)lDelimiters.get(oNext), iLastSplit)) > -1) { // TODO : Check alRes.add(new Integer(iNextSplit + ((String)lDelimiters.get(oNext)).length())); iLastSplit = iNextSplit + 1; } lLocal.remove(oNext); } Integer [] iaRes = new Integer[alRes.size()]; alRes.toArray(iaRes); gr.demokritos.iit.jinsect.utils.bubbleSortArray(iaRes); return iaRes; } /** Returns the substrings defined by a string and a set of split points. *@param sStr The string to split. *@param iRes An array of integers, indicating the points at which the string * is to be split. *@return An array of sub-strings of the given string. */ protected static String[] splitStringByDelimiterPoints(String sStr, Integer[] iRes) { ArrayList alRes = new ArrayList(); // For every split point get substring for (int iCnt=0; iCnt < iRes.length; iCnt++) { if (iCnt == 0) alRes.add(sStr.substring(0, iRes[iCnt])); else alRes.add(sStr.substring(iRes[iCnt - 1], iRes[iCnt])); } // Add last part if (iRes.length > 0) alRes.add(sStr.substring(iRes[iRes.length - 1])); else alRes.add(sStr); // No splitting String[] sRes = new String[alRes.size()]; // n split points => n+1 string parts alRes.toArray(sRes); return sRes; } /** Returns a list of indices concerning possible split points. *@param sStr The string to analyse. *@param lDelimiters An array of delimiting characters. *@return An array of integers, indicating split points in the given string. 
*/
    private Integer[] splitPointsByDelimiterList(String sStr, char[] lDelimiters) {
        // Wrap the characters in a map (index -> single-char string) and reuse
        // the SortedMap-based overload.
        TreeMap tmDels = new TreeMap();
        for (int iCnt=0; iCnt < lDelimiters.length; iCnt++)
            tmDels.put(iCnt, new String() + lDelimiters[iCnt]);
        return splitPointsByDelimiterList(sStr, tmDels);
    }

    /** Returns the entropy of the next character, given a head string.
     *@param sStr The head string.
     *@return A double indicating the entropy of the next character.
     */
    private double getEntropyOfNextChar(String sStr) {
        return getEntropyOfNextChar(sStr, false);
    }

    /** Returns the entropy of the next character, given a head string. Normalizes
     *the value if required.
     *@param sStr The head string.
     *@param bNormalized If true, divides the entropy by log2 of the number of
     * occurrences (normalized entropy).
     *@return A double indicating the entropy of the next character; 0.0 if the
     * head string does not appear in the graph or has no outgoing edges.
     */
    private final double getEntropyOfNextChar(String sStr, boolean bNormalized) {
        double dRes = 0.0;
        // Look-up current n-gram
        Vertex vStrNode = clLocator.locateVertexInGraph(sgOverallGraph, new VertexImpl(sStr));
        if (vStrNode == null)
            return dRes; // Ignore inexistent symbols
        // else get outgoing edges
        List lEdges = gr.demokritos.iit.jinsect.utils.getOutgoingEdges(sgOverallGraph, vStrNode);
        Iterator iEdgeIter = lEdges.iterator();
        Distribution dDist = new Distribution();
        if (lEdges.size() > 0) {
            // Build the next-symbol distribution from edge weights.
            while (iEdgeIter.hasNext()) {
                WeightedEdge weCur = (WeightedEdge)iEdgeIter.next();
                if ( Double.isNaN(weCur.getWeight()))
                    System.err.println("WARNING: Not a number edge weight for edge:" + weCur.toString());
                dDist.setValue(weCur.toString(), weCur.getWeight());
            }
            dDist.normalizeToSum();
            if (bNormalized) {
                // Calc NORMALIZED entropy - entropy to the number of appearences
                double dLogOccurences = (Math.log(dDist.calcTotalValues()) / Math.log(2));
                dRes = statisticalCalculation.entropy(dDist) / dLogOccurences;
            }
            else
                // Calc entropy
                dRes = statisticalCalculation.entropy(dDist);
        }
        if ( Double.isNaN(dRes))
            System.err.println("WARNING: Not a number entropy for symbol:" + vStrNode);
        return dRes;
    }

    /** Determines how many of the candidate delimiters in the given map are
     * "important", via a peak-detection heuristic over the entropy sequence.
     *@param smMap A {@link SortedMap} of candidates keyed by entropy (Double).
     *@return The number of important delimiters, as computed by
     * getDelimiterIndexByThreshold.
     */
    protected int determineImportantDelimiters(SortedMap smMap) {
        Iterator iIter
            = smMap.keySet().iterator();
        // Distribution dEntropyDist = new Distribution();
        // Distribution dEntropyDeltaDist = new Distribution();
        Distribution dDist = new Distribution();
        Distribution dReverse = new Distribution();
        // Get first number
        Double dPrv = Double.NEGATIVE_INFINITY;
        Double dTwoPrv = Double.NEGATIVE_INFINITY;
        // Create corresponding distribution.
        // Uses a 3-point window (dTwoPrv, dPrv, oNext) to score each middle
        // point; dReverse maps score -> entropy so the entropy of the top
        // score can be recovered below.
        while (iIter.hasNext()) {
            Double oNext = (Double)iIter.next();
            if ((dPrv != Double.NEGATIVE_INFINITY) && (dTwoPrv != Double.NEGATIVE_INFINITY)) {
                if (oNext.isNaN())
                    System.err.println("WARNING: Encountered NaN. Ignoring...");
                // dEntropyDeltaDist.asTreeMap().put(smMap.get(oNext), (oNext - dPrv)); // Get distance from previous data point
                // dEntropyDist.asTreeMap().put(smMap.get(oNext), oNext); // Get position of current data point
                dDist.setValue(dPrv, dPrv * Math.abs(dPrv-dTwoPrv-oNext+dPrv)); // Detect peaks
                dReverse.setValue(dPrv * Math.abs(dPrv-dTwoPrv-oNext+dPrv), dPrv);
            }
            dTwoPrv = dPrv;
            dPrv = oNext;
        }
        // DEBUG LINES
        // System.err.println("Symbol\tEntropy");
        // for (Iterator iEntropies = smMap.keySet().iterator();
        //     iEntropies.hasNext();) {
        //     Object o = iEntropies.next();
        //     String sSymbol = (String)smMap.get(o);
        //     try {
        //         sSymbol = URLEncoder.encode(sSymbol, "utf8");
        //     } catch (UnsupportedEncodingException ex) {
        //         sSymbol = "(NotPrintable)";
        //         ex.printStackTrace(System.err);
        //     }
        //     System.err.println(o.toString() + "\t" + sSymbol);
        // }
        //////////////
        // NOTE(review): dVar and dMean feed only the commented-out threshold
        // variant below; they are currently unused.
        double dVar = dDist.variance(true);
        double dMean = dDist.average(true);
        // return getDelimiterIndexByThreshold(smMap, Math.min(dMean + Math.abs(dVar), dDist.maxValue()));
        return getDelimiterIndexByThreshold(smMap, dReverse.getValue(dDist.maxValue()));
    }

    /** Returns the number of entries in the given map whose (Double) key lies
     * at or above the given threshold.
     *@param smMap The {@link SortedMap} of candidates keyed by entropy.
     *@param dThreshold The entropy threshold.
     *@return smMap.size() - (entries below threshold) + 1.
     * NOTE(review): the "+ 1" makes the result exceed smMap.size() when no key
     * is above the threshold — possible off-by-one; confirm against callers.
     */
    private final int getDelimiterIndexByThreshold(SortedMap smMap, double dThreshold) {
        // Locate delim in map
        Iterator iIter = smMap.keySet().iterator();
        int iCnt = 0;
        while (iIter.hasNext()) {
            if ((Double)iIter.next() > dThreshold)
                break;
            iCnt++;
        }
        // Indicate index
        return smMap.size() - iCnt + 1;
    }
/** Returns a sorted map of candidate delimiters for a given string and a given * n-gram size. *@param sStr The string to analyse to identify the candidate delimiters. *@param iNGramSize The n-gram size of the delimiters to extract. *@return The sorted map of delimiters, sorted by their entropy of next character. */ private final SortedMap identifyCandidateDelimiters(String sStr, int iNGramSize) { String sSubStr = null; Integer[] iRes = null; ArrayList alRes = new ArrayList(); TreeMap tmRes = new TreeMap(); for (int iCnt = 0; iCnt <= sStr.length() - iNGramSize; iCnt++) { if (iCnt + iNGramSize > sStr.length()) continue; // Get n-gram sSubStr = sStr.substring(iCnt, iCnt + iNGramSize); if (tmRes.containsValue(sSubStr)) // Ignore duplicates continue; // Look-up current n-gram Vertex vStrNode = clLocator.locateVertexInGraph(sgOverallGraph, new VertexImpl(sSubStr)); if (vStrNode == null) continue; // Ignore inexistent symbols // double dNormEntropy = getEntropyOfNextChar(sSubStr, true); double dEntropy = getEntropyOfNextChar(sSubStr, false); // tmRes.put(dNormEntropy, sSubStr); tmRes.put(dEntropy, sSubStr); } return tmRes; } /** Utility method. Used for testing purposes. */ public static void main(String[] sArgs) { String sText = "this is a test text. Indeed, this previous text is nothing but a test. " + "What do you think you should do? I would try it once more by testing..."; EntropyChunker ec = new EntropyChunker(); ec.train(sText); Iterator iIter = ec.chunkString("OK. Now where do I do the splitting? Here, or here? We shall see.").iterator(); while (iIter.hasNext()) { System.out.println(iIter.next().toString()); } } }