package com.bazaarvoice.emodb.web.resources.databus; import com.bazaarvoice.emodb.auth.jersey.Subject; import com.bazaarvoice.emodb.databus.api.PollResult; import com.codahale.metrics.Gauge; import com.codahale.metrics.Histogram; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.Timer; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Optional; import com.google.common.base.Throwables; import com.google.common.collect.Iterators; import com.google.inject.Inject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.servlet.AsyncContext; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.ws.rs.core.Response; import java.io.IOException; import java.time.Duration; import java.time.Instant; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import static com.google.common.base.Preconditions.checkNotNull; /** * Helper class for asynchronously polling databus resource requests. */ public class DatabusResourcePoller { private static final Logger _log = LoggerFactory.getLogger(DatabusResourcePoller.class); public static final int DEFAULT_NUM_KEEP_ALIVE_THREADS = 4; public static final int DEFAULT_NUM_POLLING_THREADS = 8; private static final Duration MAX_LONG_POLL_TIME = Duration.ofSeconds(20); private static final Duration LONG_POLL_RETRY_TIME = Duration.ofSeconds(2); private static final Duration LONG_POLL_SEND_REFRESH_TIME = Duration.ofMillis(200); private static final Duration KEEP_ALIVE_SAFETY_BUFFER_TIME = Duration.ofSeconds(10); private static final String POLL_DATABUS_EMPTY_HEADER = "X-BV-Databus-Empty"; private final Timer _pollTimer; private final ScheduledExecutorService _keepAliveExecutorService; private final ScheduledExecutorService _pollingExecutorService; private final Histogram _keepAliveThreadDelayHistogram; private final Histogram _pollingThreadDelayHistogram; @Inject public DatabusResourcePoller(Optional<LongPollingExecutorServices> longPollingExecutorServices, MetricRegistry metricRegistry) { checkNotNull(longPollingExecutorServices, "longPollingExecutorServices"); if (longPollingExecutorServices.isPresent()) { _keepAliveExecutorService = longPollingExecutorServices.get().getKeepAlive(); _pollingExecutorService = longPollingExecutorServices.get().getPoller(); addThreadPoolMonitoring(_keepAliveExecutorService, "inactiveKeepAliveThreads", metricRegistry); addThreadPoolMonitoring(_pollingExecutorService, "inactivePollingThreads", metricRegistry); } else { _keepAliveExecutorService = null; _pollingExecutorService = null; } _keepAliveThreadDelayHistogram = metricRegistry.histogram(MetricRegistry.name("bv.emodb.databus", "DatabusResource1", "keepAliveThreadDelay")); _pollingThreadDelayHistogram = metricRegistry.histogram(MetricRegistry.name("bv.emodb.databus", "DatabusResource1", "pollingThreadDelay")); _pollTimer = buildPollTimer(metricRegistry); } /** * Unit test constructor which does not do long polling. This is because DropWizard ResourceTest classes * do not support asynchronous requests. */ @VisibleForTesting public DatabusResourcePoller(MetricRegistry metricRegistry) { _pollTimer = buildPollTimer(metricRegistry); _keepAliveExecutorService = null; _pollingExecutorService = null; _keepAliveThreadDelayHistogram = null; _pollingThreadDelayHistogram = null; } // Runnable to intermittently poll for data and output to the response private class DatabusPollRunnable implements Runnable { private volatile boolean _pollingActive = true; private final AsyncContext _asyncContext; private KeepAliveRunnable _keepAliveRunnable; private Subject _subject; private SubjectDatabus _databus; private Duration _claimTtl; private int _limit; private String _subscription; private PeekOrPollResponseHelper _helper; private long _longPollStopTime; private Timer.Context _timerContext; private long _lastRunTime = 0; DatabusPollRunnable(AsyncContext asyncContext, KeepAliveRunnable keepAliveRunnable, Subject subject, SubjectDatabus databus, Duration claimTtl, int limit, String subscription, PeekOrPollResponseHelper helper, long longPollStopTime, Timer.Context timerContext) { _asyncContext = asyncContext; _keepAliveRunnable = keepAliveRunnable; _subject = subject; _databus = databus; _claimTtl = claimTtl; _limit = limit; _subscription = subscription; _helper = helper; _longPollStopTime = longPollStopTime; _timerContext = timerContext; } @Override public void run() { boolean rescheduled = false; try { if (_lastRunTime > 0) { // Record any delay between when we *expected* to run and when we actually ran. This should help // detect overloaded thread pools which, in turn, may lead to timeouts. _pollingThreadDelayHistogram.update(System.currentTimeMillis() - _lastRunTime - LONG_POLL_RETRY_TIME.toMillis()); } if (_pollingActive) { boolean pollFailed = false; PollResult result; try { // Issue a polling call with our server-side client. This ensures that we execute a synchronous // call on the other end and receive a quick response - we do NOT want to execute a long-poll here // and spend up to 20 seconds waiting for a response (occupying this thread all the while). result = _databus.poll(_subject, _subscription, _claimTtl, _limit); } catch (Exception e) { // We're in an async context and have already returned a 200 response. Since we can't // retroactively change to 500 finish the request with an empty response and log the error. _log.error("Failed to perform asynchronous poll on subscription {}", _subscription, e); result = new PollResult(Iterators.emptyIterator(), 0, false); pollFailed = true; } // Go ahead and output the response if we either 1.) find events to output, 2.) exceed our time // limit, or 3.) received an exception during the last poll if (result.getEventIterator().hasNext() || (System.currentTimeMillis() + LONG_POLL_RETRY_TIME.toMillis()) >= _longPollStopTime || pollFailed) { // Lock the context before writing the response to ensure that the KeepAliveRunnable doesn't // insert bad data into the middle of it synchronized (_asyncContext) { if (_pollingActive) { _keepAliveRunnable.cancelKeepAlive(); populateResponse(result, (HttpServletResponse) _asyncContext.getResponse(), _helper); } } } else { // Nothing to output; schedule the job to check again. If the result had more events then poll // again immediately, otherwise wait a few seconds. _lastRunTime = System.currentTimeMillis(); if (result.hasMoreEvents()) { _pollingExecutorService.execute(this); } else { _pollingExecutorService.schedule(this, LONG_POLL_RETRY_TIME.toMillis(), TimeUnit.MILLISECONDS); } rescheduled = true; } } } finally { // Stop the timer if we didn't reschedule the job (either because it completed or an exception was thrown) if (!rescheduled) { _timerContext.stop(); synchronized (_asyncContext) { _asyncContext.complete(); } } } } public void cancelPolling() { _pollingActive = false; } } // Runnable to provide a steady stream of whitespace data to the client to keep our connection alive private class KeepAliveRunnable implements Runnable { private final AsyncContext _asyncContext; private volatile boolean _keepAliveRequired = true; private Instant _keepAliveStopTime; private DatabusPollRunnable _databusPollRunnable; private long _lastRunTime = 0; public KeepAliveRunnable(AsyncContext asyncContext, long longPollStopTime) { _asyncContext = asyncContext; // Allow the keep-alive to run a little longer than the anticipated long-poll cutoff just to be sure that we // don't disconnect prematurely in cases where the poll() call takes longer than expected. _keepAliveStopTime = Instant.ofEpochMilli(longPollStopTime).plus(KEEP_ALIVE_SAFETY_BUFFER_TIME); } public void setDatabusPollRunnable(DatabusPollRunnable databusPollRunnable) { _databusPollRunnable = databusPollRunnable; } public void cancelKeepAlive() { _keepAliveRequired = false; } @Override public void run() { if (_lastRunTime > 0) { // Record any delay between when we *expected* to run and when we actually ran. This should help // detect overloaded thread pools which, in turn, may lead to timeouts. _keepAliveThreadDelayHistogram.update(System.currentTimeMillis() - _lastRunTime - LONG_POLL_SEND_REFRESH_TIME.toMillis()); } // Lock the context before writing to the response to ensure that we don't insert data into it while the // DatabusPollRunnable is also writing synchronized (_asyncContext) { // As long as the keep alive signal is required, send a refresh to the client and schedule another attempt // (note that we also cancel out if a certain length of time is exceeded - normally we would rely on the // DatabusPollRunnable to kill this Runnable, but this timer serves as a safety blanket in case that job // quits unexpectedly) if (_keepAliveRequired && _keepAliveStopTime.isAfter(Instant.now())) { try { sendClientRefresh((HttpServletResponse) _asyncContext.getResponse()); _lastRunTime = System.currentTimeMillis(); _keepAliveExecutorService.schedule(this, LONG_POLL_SEND_REFRESH_TIME.toMillis(), TimeUnit.MILLISECONDS); } catch (Exception ex) { _log.error("Failed to send a keep-alive message to the client"); // This exception most likely came from a failure to write the response, meaning that this // request is dead in the water - go ahead and cancel our polling runnable right away so that it // won't issue any poll() requests that might claim data that the client will never receive // (that isn't the end of the world if it happens, but WILL delay the receipt of those messages // in subsequent requests). if (_databusPollRunnable != null) { _databusPollRunnable.cancelPolling(); } } } } } } private static void populateResponse(PollResult result, HttpServletResponse response, PeekOrPollResponseHelper helper) { try { helper.getJson().writeJson(response.getOutputStream(), result.getEventIterator()); } catch (IOException ex) { _log.error("Failed to write response to the client"); } } private static void sendClientRefresh(HttpServletResponse response) throws IOException { // Only supply whitespace since this won't affect how our results are parsed (consumers should just filter it // out, even though it's sufficient to keep our connection alive) response.getOutputStream().print(' '); response.flushBuffer(); } public Response poll(Subject subject, SubjectDatabus databus, String subscription, Duration claimTtl, int limit, HttpServletRequest request, boolean ignoreLongPoll, PeekOrPollResponseHelper helper) { Timer.Context timerContext = _pollTimer.time(); boolean synchronousResponse = true; Response response; try { // Calculate when the request should internally time out (do this before our first poll() request because we // want that to count toward our total run-time) long longPollStopTime = System.currentTimeMillis() + MAX_LONG_POLL_TIME.toMillis(); // Always issue a synchronous request at the start . . this will allow us to bypass our thread pool logic // altogether in cases where we might return the value immediately (see more below). There is a danger here // that the thread will stall if "databus" is an instance of DatabusClient and we are stuck waiting for a // response - however, since we use the server-side client we know that it will always execute synchronously // itself (no long-polling) and return in a reasonable period of time. PollResult result = databus.poll(subject, subscription, claimTtl, limit); if (ignoreLongPoll || result.getEventIterator().hasNext() || _keepAliveExecutorService == null || _pollingExecutorService == null) { // If ignoreLongPoll == true or we have no executor services to schedule long-polling on then always // return a response, even if it's empty. Alternatively, if we have data to return - return it! response = Response.ok() .header(POLL_DATABUS_EMPTY_HEADER, String.valueOf(!result.hasMoreEvents())) .entity(helper.asEntity(result.getEventIterator())) .build(); } else { // If the response is empty then go into async-mode and start up the runnables for our long-polling. response = scheduleLongPollingRunnables(request, longPollStopTime, subject, databus, claimTtl, limit, subscription, result.hasMoreEvents(), helper, timerContext); synchronousResponse = false; } } finally { // Stop our timer if our request is finished here . . otherwise we are in async mode and it is the // responsibility of DatabusPollRunnable to close it out. if (synchronousResponse) { timerContext.stop(); } } return response; } private Response scheduleLongPollingRunnables(HttpServletRequest request, long longPollStopTime, Subject subject, SubjectDatabus databus, Duration claimTtl, int limit, String subscription, boolean firstPollHadMoreEvents, PeekOrPollResponseHelper helper, Timer.Context timerContext) { final AsyncContext ctx = request.startAsync(); boolean jobsScheduled = false; try { ctx.getResponse().setContentType("application/json"); // We can't know at this point if the queue will be fully drained or not, so base the "databus empty" // header on whether the original empty poll had more results. Worst case the caller has a single false empty // header which gets set correctly on the next poll. ((HttpServletResponse) ctx.getResponse()).setHeader(POLL_DATABUS_EMPTY_HEADER, String.valueOf(!firstPollHadMoreEvents)); // Immediately send a client refresh since we've already executed a poll() call and we don't know how long it // will take for our scheduled refresh to do its thing. It's probably not needed 99.9% of the time, but we // keep it here as a precaution. sendClientRefresh((HttpServletResponse) ctx.getResponse()); KeepAliveRunnable keepAliveRunnable = new KeepAliveRunnable(ctx, longPollStopTime); DatabusPollRunnable pollingRunnable = new DatabusPollRunnable(ctx, keepAliveRunnable, subject, databus, claimTtl, limit, subscription, helper, longPollStopTime, timerContext); keepAliveRunnable.setDatabusPollRunnable(pollingRunnable); // - We don't use ctx.start() here since Jetty's implementation will schedule the job on our request thread pool, // which completely nullifies the entire point of using an async implementation // - Kick off two recurring jobs, one to poll for events and another to keep the connection to the client alive. // Note that we start the keep-alive immediately since we've already taken time above to run an initial poll() // request; likewise, we delay before we run another poll() so that we don't run two in immediate succession. _pollingExecutorService.schedule(pollingRunnable, LONG_POLL_RETRY_TIME.toMillis(), TimeUnit.MILLISECONDS); _keepAliveExecutorService.schedule(keepAliveRunnable, 0, TimeUnit.MILLISECONDS); jobsScheduled = true; // The actual processing is asynchronous. However, whatever we return here is included in the result, so make // it empty. return Response.ok().build(); } catch (IOException ex) { throw Throwables.propagate(ex); } finally { // Make sure that we close out the request if our jobs (which would close it for us) failed to get scheduled if (!jobsScheduled) { ctx.complete(); } } } private static void addThreadPoolMonitoring(ScheduledExecutorService executorService, String metricName, MetricRegistry metricRegistry) { if (executorService instanceof ThreadPoolExecutor) { final ThreadPoolExecutor monitoredThreadPoolExecutor = (ThreadPoolExecutor)executorService; metricRegistry.register(MetricRegistry.name("bv.emodb.databus", "DatabusResource1", metricName), new Gauge<Integer>() { @Override public Integer getValue() { return monitoredThreadPoolExecutor.getPoolSize() - monitoredThreadPoolExecutor.getActiveCount(); } }); } } private static Timer buildPollTimer(MetricRegistry metricRegistry) { return metricRegistry.timer(MetricRegistry.name("bv.emodb.databus", "DatabusResource1", "poll")); } }