package com.ubervu.river.github; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonStreamParser; import org.apache.commons.codec.digest.DigestUtils; import org.elasticsearch.action.bulk.BulkProcessor; import org.elasticsearch.action.bulk.BulkRequest; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.Client; import org.elasticsearch.common.Base64; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.support.XContentMapValues; import org.elasticsearch.index.query.FilterBuilders; import org.elasticsearch.index.query.FilteredQueryBuilder; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.indices.IndexAlreadyExistsException; import org.elasticsearch.river.AbstractRiverComponent; import org.elasticsearch.river.River; import org.elasticsearch.river.RiverName; import org.elasticsearch.river.RiverSettings; import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.sort.SortBuilders; import org.elasticsearch.search.sort.SortOrder; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; import static org.elasticsearch.index.query.QueryBuilders.termQuery; public class GitHubRiver extends AbstractRiverComponent implements River { private final Client client; private final String index; private final String repository; private final String owner; private final int userRequestedInterval; private final String endpoint; private String password; private String username; private DataStream dataStream; private String eventETag = null; private int pollInterval = 60; @SuppressWarnings({"unchecked"}) @Inject public GitHubRiver(RiverName riverName, RiverSettings settings, Client client) { super(riverName, settings); this.client = client; if (!settings.settings().containsKey("github")) { throw new IllegalArgumentException("Need river settings - owner and repository."); } // get settings Map<String, Object> githubSettings = (Map<String, Object>) settings.settings().get("github"); owner = XContentMapValues.nodeStringValue(githubSettings.get("owner"), null); repository = XContentMapValues.nodeStringValue(githubSettings.get("repository"), null); index = String.format("%s&%s", owner, repository); userRequestedInterval = XContentMapValues.nodeIntegerValue(githubSettings.get("interval"), 60); // auth (optional) username = null; password = null; if (githubSettings.containsKey("authentication")) { Map<String, Object> auth = (Map<String, Object>) githubSettings.get("authentication"); username = XContentMapValues.nodeStringValue(auth.get("username"), null); password = XContentMapValues.nodeStringValue(auth.get("password"), null); } // endpoint (optional - default to github.com) endpoint = XContentMapValues.nodeStringValue(githubSettings.get("endpoint"), "https://api.github.com"); logger.info("Created GitHub river."); } @Override public void start() { // create the index explicitly so we can use the whitespace tokenizer // because there are usernames like "user-name" and we want those // to be treated as just one term try { Settings indexSettings = ImmutableSettings.settingsBuilder().put("analysis.analyzer.default.tokenizer", "whitespace").build(); client.admin().indices().prepareCreate(index).setSettings(indexSettings).execute().actionGet(); logger.info("Created index."); } catch (IndexAlreadyExistsException e) { logger.info("Index already created"); } catch (Exception e) { logger.error("Exception creating index.", e); } dataStream = new DataStream(); dataStream.start(); logger.info("Started GitHub river."); } @Override public void close() { dataStream.setRunning(false); dataStream.interrupt(); logger.info("Stopped GitHub river."); } private class DataStream extends Thread { private volatile boolean isRunning; @Inject public DataStream() { super("DataStream thread"); isRunning = true; } private boolean checkAndUpdateETag(HttpURLConnection conn) throws IOException { if (eventETag != null) { conn.setRequestProperty("If-None-Match", eventETag); } String xPollInterval = conn.getHeaderField("X-Poll-Interval"); if (xPollInterval != null) { logger.debug("Next GitHub specified minimum polling interval is {} s", xPollInterval); pollInterval = Integer.parseInt(xPollInterval); } if (conn.getResponseCode() == 304) { logger.debug("304 {}", conn.getResponseMessage()); return false; } String eTag = conn.getHeaderField("ETag"); if (eTag != null) { logger.debug("New eTag: {}", eTag); eventETag = eTag; } return true; } private boolean indexResponse(HttpURLConnection conn, String type) { InputStream input; try { input = conn.getInputStream(); } catch (IOException e) { logger.info("Exception encountered (403 usually is rate limit exceeded): ", e); return false; } JsonStreamParser jsp = new JsonStreamParser(new InputStreamReader(input)); JsonArray array = (JsonArray) jsp.next(); BulkProcessor bp = BulkProcessor.builder(client, new BulkProcessor.Listener() { @Override public void beforeBulk(long executionId, BulkRequest request) { } @Override public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { } @Override public void afterBulk(long executionId, BulkRequest request, Throwable failure) { } }).build(); boolean continueIndexing = true; IndexRequest req = null; for (JsonElement e: array) { if (type.equals("event")) { req = indexEvent(e); if (req == null) { continueIndexing = false; logger.debug("Found existing event, all remaining events has already been indexed"); break; } } else if (type.equals("issue")) { req = indexOther(e, "IssueData", true); } else if (type.equals("pullreq")) { req = indexOther(e, "PullRequestData"); } else if (type.equals("milestone")) { req = indexOther(e, "MilestoneData"); } else if (type.equals("label")) { req = indexOther(e, "LabelData"); } else if (type.equals("collaborator")) { req = indexOther(e, "CollaboratorData"); } bp.add(req); } bp.close(); try { input.close(); } catch (IOException e) { logger.warn("Couldn't close connection?", e); } return continueIndexing; } private boolean isEventIndexed(String id) { return client.prepareGet(index, null, id).get().isExists(); } private IndexRequest indexEvent(JsonElement e) { JsonObject obj = e.getAsJsonObject(); String type = obj.get("type").getAsString(); String id = obj.get("id").getAsString(); if (isEventIndexed(id)) { return null; } IndexRequest req = new IndexRequest(index) .type(type) .id(id).create(false) // we want to overwrite old items .source(e.toString()); return req; } private IndexRequest indexOther(JsonElement e, String type, boolean overwrite) { JsonObject obj = e.getAsJsonObject(); // handle objects that don't have IDs (i.e. labels) // set the ID to the MD5 hash of the string representation String id; if (obj.has("id")) { id = obj.get("id").getAsString(); } else { id = DigestUtils.md5Hex(e.toString()); } IndexRequest req = new IndexRequest(index) .type(type) .id(id).create(!overwrite) .source(e.toString()); return req; } private IndexRequest indexOther(JsonElement e, String type) { return indexOther(e, type, false); } private HashMap<String, String> parseHeader(String header) { // inspired from https://github.com/uberVU/elasticboard/blob/4ccdfd8c8e772c1dda49a29a7487d14b8d820762/data_processor/github.py#L73 Pattern p = Pattern.compile("\\<([a-z/0-9:\\.\\?_&=]+page=([0-9]+))\\>;\\s*rel=\\\"([a-z]+)\\\".*"); Matcher m = p.matcher(header); if (!m.matches()) { return null; } HashMap<String, String> data = new HashMap<String, String>(); data.put("url", m.group(1)); data.put("page", m.group(2)); data.put("rel", m.group(3)); return data; } private boolean morePagesAvailable(URLConnection response) { String link = response.getHeaderField("link"); if (link == null || link.length() == 0) { return false; } HashMap<String, String> headerData = parseHeader(response.getHeaderField("link")); if (headerData == null) { return false; } String rel = headerData.get("rel"); return rel.equals("next"); } private String nextPageURL(URLConnection response) { HashMap<String, String> headerData = parseHeader(response.getHeaderField("link")); if (headerData == null) { return null; } return headerData.get("url"); } private void addAuthHeader(URLConnection connection) { if (username == null || password == null) { return; } String auth = String.format("%s:%s", username, password); String encoded = Base64.encodeBytes(auth.getBytes()); connection.setRequestProperty("Authorization", "Basic " + encoded); } private boolean getData(String fmt, String type) { return getData(fmt, type, null); } private boolean getData(String fmt, String type, String since) { try { URL url; if (since != null) { url = new URL(String.format(fmt, owner, repository, since)); } else { url = new URL(String.format(fmt, owner, repository)); } HttpURLConnection connection = (HttpURLConnection) url.openConnection(); addAuthHeader(connection); if (type.equals("event")) { boolean modified = checkAndUpdateETag(connection); if (!modified) { return false; } } boolean continueIndexing = indexResponse(connection, type); while (continueIndexing && morePagesAvailable(connection)) { url = new URL(nextPageURL(connection)); connection = (HttpURLConnection) url.openConnection(); addAuthHeader(connection); continueIndexing = indexResponse(connection, type); } } catch (Exception e) { logger.error("Exception in getData", e); } return true; } private void deleteByType(String type) { client.prepareDeleteByQuery(index) .setQuery(termQuery("_type", type)) .execute() .actionGet(); } /** * Gets the creation data of the single newest entry. * * @return ISO8601 formatted time of most recent entry, or null on empty or error. */ private String getMostRecentEntry() { long totalEntries = client.prepareCount(index).setQuery(matchAllQuery()).execute().actionGet().getCount(); if (totalEntries > 0) { FilteredQueryBuilder updatedAtQuery = QueryBuilders .filteredQuery(QueryBuilders.matchAllQuery(), FilterBuilders.existsFilter("created_at")); FieldSortBuilder updatedAtSort = SortBuilders.fieldSort("created_at").order(SortOrder.DESC); SearchResponse response = client.prepareSearch(index) .setQuery(updatedAtQuery) .addSort(updatedAtSort) .setSize(1) .execute() .actionGet(); String createdAt = (String) response.getHits().getAt(0).getSource().get("created_at"); logger.debug("Most recent event was created at {}", createdAt); return createdAt; } else { // getData will get all data on a null. logger.info("No existing entries, assuming first run"); return null; } } @Override public void run() { while (isRunning) { // Must be read before getting new events. String mostRecentEntry = getMostRecentEntry(); logger.debug("Checking for events"); if (getData(endpoint + "/repos/%s/%s/events?per_page=1000", "event")) { logger.debug("First run or new events found, fetching rest of the data"); if (mostRecentEntry != null) { getData(endpoint + "/repos/%s/%s/issues?state=all&per_page=1000&since=%s", "issue", mostRecentEntry); } else { getData(endpoint + "/repos/%s/%s/issues?state=all&per_page=1000", "issue"); } // delete pull req data - we are only storing open pull reqs // and when a pull request is closed we have no way of knowing; // this is why we have to delete them and reindex "fresh" ones deleteByType("PullRequestData"); getData(endpoint + "/repos/%s/%s/pulls", "pullreq"); // same for milestones deleteByType("MilestoneData"); getData(endpoint + "/repos/%s/%s/milestones?per_page=1000", "milestone"); // collaborators deleteByType("CollaboratorData"); getData(endpoint + "/repos/%s/%s/collaborators?per_page=1000", "collaborator"); // and for labels - they have IDs based on the MD5 of the contents, so // if a property changes, we get a "new" document deleteByType("LabelData"); getData(endpoint + "/repos/%s/%s/labels?per_page=1000", "label"); } else { logger.debug("No new events found"); } try { int waitTime = Math.max(pollInterval, userRequestedInterval) * 1000; logger.debug("Waiting {} ms before polling for new events", waitTime); Thread.sleep(waitTime); // needs milliseconds } catch (InterruptedException e) { logger.info("Wait interrupted, river was probably stopped"); } } } public void setRunning(boolean running) { isRunning = running; } } }