package lucene4ir.BiGramGenerator; import javax.xml.bind.JAXB; import javax.xml.bind.annotation.XmlRootElement; import java.io.File; import java.io.PrintWriter; import java.util.HashMap; import java.util.Iterator; import java.util.Map; public class BiGramGenerator { // Public Variables public BiGramGeneratorParams p; public HashMap<String,queryInfo> biGramMap, uniGramMap; private final int maxShingles = 2; private double log2; private queryInfo currentQryInfo ; private String currentGram , terms[] , sourceParameterFile; // Constructor Method public BiGramGenerator(String inputParameterFile) { if (inputParameterFile.isEmpty()) sourceParameterFile = "params/BiGramGenerator.xml"; else sourceParameterFile = inputParameterFile; biGramMap = new HashMap<String, queryInfo>(); uniGramMap = new HashMap<String, queryInfo>(); } private void displayMsg(String msg) { System.out.println(msg); System.exit(0); } private long getUnigramFrequency (String term) { long result = 1; // Default 1 if (uniGramMap.containsKey(term)) result = uniGramMap.get(term).collFreq; return result; } private void readParamsFromFile() { System.out.println("Reading Param File"); try { p = JAXB.unmarshal(new File(sourceParameterFile), BiGramGeneratorParams.class ); if (p.indexName.toString().isEmpty()) displayMsg ("IndexName Parameter is Missing"); System.out.println("Index: " + p.indexName); if (p.outFilePath.toString().isEmpty()) displayMsg ("Query Output Path Parameter is Missing"); if (p.cutoff < 1) { p.cutoff = 0; } System.out.println("biGram Cutoff: " + p.cutoff); } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); System.exit(1); } } private void readFilterBigrams() throws Exception { /* This Function is Used to Read Bigrams and Filter Them based on input cutoff and required input field */ String fieldName = lucene4ir.Lucene4IRConstants.FIELD_ALL; DumpTermsApp dump = new DumpTermsApp(); dump.indexName = p.indexName; dump.openReader(); dump.getGramList(fieldName, biGramMap,p.cutoff, uniGramMap); } private void iterateGrams(int gramSize) throws Exception { /* Iterate Through Grams : 1- Calculate Score 2- Output Grams */ Iterator it; Map.Entry currentItem; String shortOutFile , longOutFile , outputPrefix = "" , line; int qryID; // Initialize Variables; if(gramSize == 1) { // Unigram if (uniGramMap.size() < 1) return; it = uniGramMap.entrySet().iterator(); outputPrefix = "Uni"; } else { // Bigram if (biGramMap.size() < 1) return; outputPrefix = "Bi"; it = biGramMap.entrySet().iterator(); log2 = Math.log10(2); } shortOutFile = String.format("%s/short%sgram.txt", p.outFilePath, outputPrefix); longOutFile = String.format("%s/long%sgram.txt", p.outFilePath, outputPrefix); line = String.format("\n\n\n%sGrams\n---------\n" , outputPrefix); PrintWriter prShort = new PrintWriter(shortOutFile); PrintWriter prLong = new PrintWriter(longOutFile); System.out.print(line); qryID = 1; while (it.hasNext()) { currentItem = (Map.Entry) it.next(); currentGram = currentItem.getKey().toString(); calculateScore(gramSize,currentItem); // Output line = qryID + " " + currentGram ; prShort.write(line + "\n"); if (gramSize == 1) line += " " + currentQryInfo.collFreq + " " + currentQryInfo.weight + "\n"; else { line += String.format(" %d %d %d %f\n" , getUnigramFrequency(terms[0]) , getUnigramFrequency(terms[1]) , currentQryInfo.collFreq , currentQryInfo.weight ); } // End Else qryID++; prLong.write(line); System.out.print(line); } // End While prLong.close(); prShort.close(); } private void calculateScore(int gramSize , Map.Entry currentItem) { double score , pi , pj , pij , currentWeight ; long v1 , v2 , currentFreq , biGramSize , uniGramSize; currentQryInfo = (queryInfo) currentItem.getValue(); currentFreq = currentQryInfo.collFreq; if (gramSize == 1) score = currentFreq; else { // Bigram Score biGramSize = biGramMap.size(); uniGramSize = uniGramMap.size(); pij = currentFreq * 1.0 / biGramSize; terms = currentGram.split(" "); v1 = getUnigramFrequency(terms[0]); v2 = getUnigramFrequency(terms[1]); pi = v1 * 1.0 / uniGramSize; pj = v2 * 1.0 / uniGramSize; score = Math.log10(pij / (pi * pj)) / log2; } currentQryInfo.weight = score; } public void main() { // Reading Parameters from Retrievability Counter XML File try { readParamsFromFile(); readFilterBigrams(); iterateGrams(2); } catch (Exception ex) { System.out.println("RunTime Error During Query Generation"); System.out.println(ex.getMessage()); } } public static void main (String args[]) { BiGramGenerator qg = new BiGramGenerator(""); qg.main(); } // SubClasses class queryInfo { long collFreq; double weight; } @XmlRootElement(name = "BiGramGeneratorParams") static public class BiGramGeneratorParams { public String indexName , outFilePath; public int cutoff ; } }