/**************************************************************************
 * Developed by Language Technologies Institute, Carnegie Mellon University
 * Written by Richard Wang (rcwang#cs,cmu,edu)
 **************************************************************************/
package com.rcwang.seal.fetch;

import java.io.File;
import java.util.Set;

import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import com.rcwang.seal.expand.LangProvider;
import com.rcwang.seal.util.Helper;

/**
 * A {@link WebSearcher} that builds query URLs for the Google AJAX web-search
 * endpoint ({@link #BASE_URL}) and parses its JSON responses into
 * {@link Snippet}s.
 */
public class GoogleAPISearcher extends WebSearcher {

  /********************** Google AJAX Parameters ********************/
  public static final String BASE_URL = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&safe=off&";
  /** Name of the URL parameter that selects the result-set size. */
  public static final String RESULTS_KEY = "rsz";
  /** Result-set size used when at most {@code MAX_RESULTS_PER_PAGE / 2} results are requested. */
  public static final String RESULTS_SMALL = "small";
  /** Result-set size used when more than {@code MAX_RESULTS_PER_PAGE / 2} results are requested. */
  public static final String RESULTS_LARGE = "large";
  /** Name of the URL parameter carrying the index of the first result. */
  public static final String START_KEY = "start";
  /** Name of the URL parameter carrying the language restriction. */
  public static final String LANG_KEY = "lr";
  /** Name of the URL parameter carrying the query terms. */
  public static final String QUERY_KEY = "q";
  /** Prefix prepended to a language ID to form the language code sent to Google. */
  public static final String GOOGLE_LANG_PREFIX = "lang_";
  public static final int MAX_RESULTS_PER_PAGE = 8;
  public static final int MAX_PAGES = 8;
  /******************************************************************/

  /** Host name extracted from {@link #BASE_URL}. */
  public static final String HOST = Helper.toURL(BASE_URL).getHost();
  public static Logger log = Logger.getLogger(GoogleAPISearcher.class);

  /**
   * Manual smoke test: runs a single query and reports whether the number of
   * snippets returned equals the number requested.
   *
   * @param args ignored
   */
  public static void main(String args[]) {
    int numResults = 64;
    GoogleAPISearcher gs = new GoogleAPISearcher();
    gs.setLangID("en");
    gs.setCacheDir(new File("/www.cache/"));
    gs.setNumResults(numResults);
    gs.setTimeOutInMS(10*1000);
    gs.setMaxDocSizeInKB(512);
    gs.addQuery("(\"Richard C. Wang\" OR \"David C. Wang\")", false);
    gs.run();

    Set<Snippet> snippets = gs.getSnippets();
    for (Snippet snippet : snippets) {
      log.info(snippet);
    }

    if (numResults == snippets.size()) {
      log.info("Test succeeded!");
    } else {
      log.error("Test failed! Expecting: " + numResults + " Actual: " + snippets.size());
    }
  }

  /**
   * Creates a searcher whose page size is capped at
   * {@link #MAX_RESULTS_PER_PAGE}.
   */
  public GoogleAPISearcher() {
    super();
    setMaxResultPerPage(MAX_RESULTS_PER_PAGE);
  }

  /**
   * Sets the language restriction used when building search URLs.
   *
   * @param langID language ID; {@code null} or the ID stored in
   *        {@code LangProvider.UNI} clears the restriction (empty language
   *        code), any other value is prefixed with {@link #GOOGLE_LANG_PREFIX}
   */
  public void setLangID(String langID) {
    if (langID == null || langID.equals(LangProvider.UNI[LangProvider.ID])) {
      langCode = "";
    } else {
      langCode = GOOGLE_LANG_PREFIX + langID;
    }
  }

  /**
   * Parses one JSON result page and appends a {@link Snippet} for every
   * result that carries both a page URL and a title.
   * <p>
   * Per-result fields are read leniently: a result without a URL or title is
   * skipped, and a missing cache URL or summary becomes the empty string, so
   * one incomplete result no longer discards the remaining results of the
   * page. A structurally invalid document (unparsable JSON, or no
   * {@code "results"} array) is logged and ignored.
   *
   * @param resultPage raw JSON text of a result page; {@code null} or empty
   *        input is ignored
   */
  protected void buildSnippets(String resultPage) {
    if (resultPage == null || resultPage.length() == 0) {
      return;
    }
    final String responseDataStr = "responseData";

    try {
      JSONObject document = new JSONObject(resultPage);
      if (document.isNull(responseDataStr)) {
        return;
      }
      JSONObject responseData = document.getJSONObject(responseDataStr);
      JSONArray results = responseData.getJSONArray("results");

      for (int i = 0; i < results.length(); i++) {
        // optJSONObject yields null (instead of throwing) for a non-object entry
        JSONObject result = results.optJSONObject(i);
        if (result == null) {
          continue;
        }

        // getString() throws on a missing key and never returns null;
        // optString(key, null) is what makes the validity check below effective
        String pageURL = result.optString("unescapedUrl", null);
        String title = result.optString("titleNoFormatting", null);

        // a valid snippet must contain page URL and title
        if (pageURL == null || title == null) {
          continue;
        }

        // optional fields: default to "" when absent
        String cacheURL = result.optString("cacheUrl");
        String summary = result.optString("content");

        Snippet snippet = new Snippet();
        snippet.setPageURL(pageURL);
        snippet.setTitle(title);
        snippet.setCacheURL(cacheURL);
        snippet.setSummary(summary);
        snippet.setRank(snippets.size()+1);
        snippets.add(snippet);
      }
    } catch (JSONException e) {
      // pass the exception as the throwable so the stack trace is logged
      log.error("Could not parse Google search result page", e);
    }
  }

  /**
   * Returns a URL for Google search page
   * @param numResultsForThisPage number of results per page (between 1 and
   *        {@code maxResultsPerPage} inclusively)
   * @param pageNum page number (between 1 and {@link #MAX_PAGES} inclusively)
   * @param query query terms
   * @return Google search page URL
   * @throws IllegalArgumentException if either number is out of range
   */
  protected String getSearchURL(int numResultsForThisPage, int pageNum, String query) {
    if (numResultsForThisPage < 1 || numResultsForThisPage > maxResultsPerPage) {
      throw new IllegalArgumentException("Number of results for this page must be between 1 and " + maxResultsPerPage + " inclusively.");
    }
    if (pageNum < 1 || pageNum > MAX_PAGES) {
      throw new IllegalArgumentException("Page number must be between 1 and " + MAX_PAGES + " inclusively.");
    }
    // pages are 1-based, result offsets are 0-based
    int startIndex = (pageNum-1) * maxResultsPerPage;
    return getURL(numResultsForThisPage, startIndex, langCode, query);
  }

  /**
   * Assembles a search URL from {@link #BASE_URL} and the given parameters.
   *
   * @param numResults requested number of results; more than
   *        {@code MAX_RESULTS_PER_PAGE / 2} selects {@link #RESULTS_LARGE},
   *        otherwise {@link #RESULTS_SMALL}
   * @param start zero-based index of the first result
   * @param langID language code placed in the {@link #LANG_KEY} parameter;
   *        {@code null} is treated as the empty string
   * @param query query terms, appended verbatim
   *        (NOTE(review): presumably already URL-encoded by the caller - confirm)
   * @return the complete search URL
   */
  public static String getURL(int numResults, int start, String langID, String query) {
    String resultsSize = numResults > (MAX_RESULTS_PER_PAGE/2) ? RESULTS_LARGE : RESULTS_SMALL;
    // avoid emitting the literal text "null" as the language restriction
    String lang = (langID == null) ? "" : langID;

    StringBuilder url = new StringBuilder(BASE_URL);
    url.append(RESULTS_KEY).append("=").append(resultsSize).append("&");
    url.append(START_KEY).append("=").append(start).append("&");
    url.append(LANG_KEY).append("=").append(lang).append("&");
    url.append(QUERY_KEY).append("=").append(query);
    return url.toString();
  }
}