package uk.ac.shef.dcs.jate.app;

import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.algorithm.ChiSquare;
import uk.ac.shef.dcs.jate.feature.*;
import uk.ac.shef.dcs.jate.model.JATETerm;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;

/**
 * Command-line and programmatic entry point that runs the {@link ChiSquare} algorithm
 * over a Solr core: it builds the frequency, context and co-occurrence features,
 * executes the algorithm, applies the post-filtering cutoff and returns the terms.
 */
public class AppChiSquare extends App {
    private final Logger log = LoggerFactory.getLogger(getClass());

    /** Fraction of terms considered 'frequent'; by default the top 30%. Must be within (0, 1.0]. */
    private double frequentTermFT = 0.3;

    /**
     * Runs Chi-Square term extraction from the command line.
     * <p>
     * The last two arguments are the solr home path and the solr core name (in that order);
     * any preceding arguments are optional run-time parameters, see
     * {@link uk.ac.shef.dcs.jate.app.AppParams}.
     * <p>
     * Chi-Square specific setting: frequent term cutoff percentage, see
     * {@link uk.ac.shef.dcs.jate.app.AppParams#CHISQUERE_FREQ_TERM_CUTOFF_PERCENTAGE}.
     * <p>
     * The JVM is always terminated explicitly when the run ends: with status 0 on success
     * and status 1 when the arguments are incomplete or the run fails.
     *
     * @param args command-line params: optional run-time parameters followed by
     *             solr home path and solr core name
     * @throws JATEException if the run-time parameters cannot be resolved before the run starts
     */
    public static void main(String[] args) throws JATEException {
        // Both the solr home path and the core name are mandatory; they are read from the
        // last two positions, so fewer than two arguments cannot be a valid invocation.
        if (args.length < 2) {
            printHelp();
            System.exit(1);
        }
        String solrHomePath = args[args.length - 2];
        String solrCoreName = args[args.length - 1];

        Map<String, String> params = getParams(args);
        String jatePropertyFile = getJATEProperties(params);
        String corpusDir = getCorpusDir(params);

        // Assume failure until the whole pipeline has completed, so that any exception
        // (checked or unchecked) results in a non-zero exit status.
        int exitCode = 1;
        try {
            App app = new AppChiSquare(params);
            if (isCorpusProvided(corpusDir)) {
                app.index(Paths.get(corpusDir), Paths.get(solrHomePath), solrCoreName, jatePropertyFile);
            }

            List<JATETerm> terms = app.extract(solrHomePath, solrCoreName, jatePropertyFile);

            if (isExport(params)) {
                app.write(terms);
            }
            exitCode = 0;
        } catch (Exception e) {
            // Top-level boundary: report the failure here, because the forced exit below
            // would otherwise terminate the JVM before the exception is ever printed.
            LoggerFactory.getLogger(AppChiSquare.class).error("Chi-Square term extraction failed.", e);
        } finally {
            System.exit(exitCode);
        }
    }

    /**
     * Creates the application and reads the Chi-Square specific cutoff parameter.
     *
     * @param initParams initial parameters including pre-filtering and post-filtering parameters
     *                   and the chisquare specific parameter
     * @throws JATEException if a parameter is not set correctly
     * @see uk.ac.shef.dcs.jate.app.AppParams
     */
    public AppChiSquare(Map<String, String> initParams) throws JATEException {
        super(initParams);
        initializeFTParam(initParams);
    }

    /**
     * Reads the frequent term cutoff percentage from the initial parameters, if present.
     * The field keeps its default when the parameter is absent and is only updated once
     * the supplied value has been validated.
     *
     * @param initParams chisquare specific initial parameter
     * @throws JATEException if the value is not a decimal number or is outside (0, 1.0]
     * @see uk.ac.shef.dcs.jate.app.AppParams#CHISQUERE_FREQ_TERM_CUTOFF_PERCENTAGE
     */
    private void initializeFTParam(Map<String, String> initParams) throws JATEException {
        // This param is Chi-Square only
        String sFT = initParams.get(AppParams.CHISQUERE_FREQ_TERM_CUTOFF_PERCENTAGE.getParamKey());
        if (sFT == null) {
            return;
        }

        double cutoff;
        try {
            cutoff = Double.parseDouble(sFT);
        } catch (NumberFormatException nfe) {
            throw new JATEException("Frequent Term cutoff percentage ('-ft') is not set correctly! " +
                    "A decimal value is expected!");
        }

        // Written as a positive range test so that NaN (which fails every comparison)
        // is rejected as well.
        boolean withinRange = cutoff > 0.0 && cutoff <= 1.0;
        if (!withinRange) {
            throw new JATEException("Frequent Term cutoff percentage ('-ft') is not set correctly! " +
                    "Value must be within (0,1.0]");
        }
        frequentTermFT = cutoff;
    }

    /**
     * Loads the JATE properties from the given file and extracts terms from the core.
     *
     * @param core             solr core to extract terms from
     * @param jatePropertyFile JATE property file passed to {@code getJateProperties}
     * @return the extracted terms after post-filtering
     * @throws IOException   declared by the overridden method
     * @throws JATEException if properties cannot be loaded or extraction fails
     */
    @Override
    public List<JATETerm> extract(SolrCore core, String jatePropertyFile)
            throws IOException, JATEException {
        JATEProperties properties = getJateProperties(jatePropertyFile);

        return extract(core, properties);
    }

    /**
     * Builds all features required by Chi-Square from the core's index, runs the algorithm
     * on the terms of the co-occurrence feature, applies the post-filtering cutoff and
     * attaches additional term information.
     *
     * @param core       solr core to extract terms from; it is opened first if it reports closed
     * @param properties JATE properties used by the feature builders
     * @return the extracted terms after post-filtering
     * @throws JATEException if building a feature or running the algorithm fails
     */
    public List<JATETerm> extract(SolrCore core, JATEProperties properties) throws JATEException {
        log.info("extract terms from core ... ");
        if (core.isClosed()) {
            core.open();
        }
        // NOTE(review): the searcher obtained here is never released in this method
        // (a former searcher.close() in a finally block had been disabled) - confirm
        // whether the reference should be released once extraction has finished.
        SolrIndexSearcher searcher = core.getSearcher().get();

        FrequencyTermBasedFBMaster ftbb = new FrequencyTermBasedFBMaster(searcher, properties, 0);
        FrequencyTermBased ft = (FrequencyTermBased) ftbb.build();

        // sentence is a context
        FrequencyCtxSentenceBasedFBMaster fcsbb =
                new FrequencyCtxSentenceBasedFBMaster(searcher, properties, 0);
        FrequencyCtxBased fcs = (FrequencyCtxBased) fcsbb.build();
        // reference contexts derived from fcs and ft using the frequent term cutoff
        FrequencyCtxBased ref_fcs = (FrequencyCtxBased) (
                new FrequencyCtxBasedCopier(searcher, properties, fcs, ft, frequentTermFT).build());

        // Alternative kept for reference: window is a context
        /*FrequencyCtxWindowBasedFBMaster fcsbb = new FrequencyCtxWindowBasedFBMaster(searcher, properties, null, 5, 0);
        FrequencyCtxBased fcsb = (FrequencyCtxBased) fcsbb.build();
        FrequencyCtxBased ref_fcsb = (FrequencyCtxBased) (new
                FrequencyCtxWindowBasedFBMaster(searcher, properties, fcsb.getMapCtx2TTF().keySet(), 5, 0).build());*/

        CooccurrenceFBMaster cob = new CooccurrenceFBMaster(searcher, properties, ft,
                this.prefilterMinTTF, fcs, ref_fcs, this.prefilterMinTCF);
        Cooccurrence co = (Cooccurrence) cob.build();

        // feature expected probability for frequent terms
        ChiSquareFrequentTermsFBMaster cf = new ChiSquareFrequentTermsFBMaster(
                ref_fcs.getMapCtx2TTF(), ref_fcs.getTerm2Ctx(), ft.getCorpusTotal(), properties);
        ChiSquareFrequentTerms cff = (ChiSquareFrequentTerms) cf.build();

        ChiSquare chi = new ChiSquare();
        chi.registerFeature(FrequencyCtxBased.class.getName() + ChiSquare.SUFFIX_TERM, fcs);
        chi.registerFeature(Cooccurrence.class.getName(), co);
        chi.registerFeature(ChiSquareFrequentTerms.class.getName(), cff);

        log.info("start to run chisquare ...");
        List<JATETerm> terms = chi.execute(co.getTerms());
        log.info("complete chisquare statistics for all terms.");

        log.info("post-filtering terms ...");
        terms = cutoff(terms);
        log.info("complete postfiltering of terms.");

        addAdditionalTermInfo(terms, searcher, properties.getSolrFieldNameJATENGramInfo(),
                properties.getSolrFieldNameID());
        return terms;
    }
}