/*
 * Copyright 2019 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// The CloudSpeechSession streams audio to the server until the endpointer tells it to stop. It
// must therefore be reopened repeatedly for continuous transcription. The response observer
// receives data back from the server. Our CloudSpeechStreamObserver extracts the speech and the
// confidence and passes the data to a SpeechSessionListener, which helps to aggregate
// TranscriptionResults and to manage the repeated reopening of sessions.
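// A minimal usage sketch, for orientation only. The driver loop, `params`, `listener`, and
// `modelOptions` below are hypothetical, and init/processAudioBytes/requestCloseSession are
// assumed to be the public SpeechSession entry points wrapping the *Impl methods in this class:
//
//   CloudSpeechSession session =
//       new CloudSpeechSession(params, listener, /* sampleRateHz= */ 16000, channel);
//   session.init(modelOptions, /* chunkSizeSamples= */ 1600); // 100ms buffers at 16kHz.
//   while (microphoneIsOpen()) {
//     byte[] buffer = readAudioChunk(); // 16-bit LinearPCM.
//     session.processAudioBytes(buffer, 0, buffer.length);
//   }
//   session.requestCloseSession();
//   // Keep the session alive until `listener` receives OK_TO_TERMINATE, then discard it; for
//   // continuous transcription, open a new CloudSpeechSession and repeat.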
package com.google.audio.asr.cloud;

import com.google.audio.StreamingAudioEncoder;
import com.google.audio.asr.CloudSpeechSessionParams;
import com.google.audio.asr.SpeechRecognitionModelOptions;
import com.google.audio.asr.SpeechSession;
import com.google.audio.asr.SpeechSessionListener;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.SpeechContext;
import com.google.cloud.speech.v1p1beta1.SpeechGrpc;
import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig;
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest;
import com.google.common.flogger.FluentLogger;
import com.google.protobuf.ByteString;
import io.grpc.ManagedChannel;
import io.grpc.stub.StreamObserver;
import java.io.IOException;
import org.joda.time.Duration;

/**
 * Lightweight wrapper around the gRPC Google Cloud Speech API. It can handle one streaming
 * recognition request at a time.
 */
public class CloudSpeechSession extends SpeechSession {
  private static final FluentLogger logger = FluentLogger.forEnclosingClass();

  private SpeechGrpc.SpeechStub speechClient;

  // Since the speech session times out after 5 minutes, we should try to avoid sessions reaching
  // approximately that length. If speech begins 4 minutes and 30 seconds into the transcription,
  // it will likely be cut off. Therefore, we close sessions that haven't received any results in
  // an extended period of time.
  public static final Duration CLOSE_SESSION_AFTER_EXTENDED_SILENCE_DURATION =
      Duration.standardMinutes(4).plus(Duration.standardSeconds(30));

  /**
   * The stream observer and cloud interaction functions are factored out so that they may be
   * replaced with a local server simulation in testing.
   */
  private CloudSpeechStreamObserver responseObserver;

  private StreamObserver<StreamingRecognizeRequest> requestObserver;
  private final CloudSpeechSessionParams params;
  private final SpeechSessionListener speechSessionListener;
  private int sampleRateHz;
  private boolean stillSendingRequests = false;
  private final ManagedChannel channel;
  private StreamingAudioEncoder encoder;
  private boolean encoderIsRequested;
  private boolean encoderIsSupported;

  /**
   * @param params Session parameters (encoder, observer, and profanity-filter settings).
   * @param speechSessionListener Listener for recognition responses.
   * @param sampleRateHz Sample rate of microphone data.
   * @param channel The channel to use for cloud communication.
   */
  public CloudSpeechSession(
      CloudSpeechSessionParams params,
      SpeechSessionListener speechSessionListener,
      int sampleRateHz,
      ManagedChannel channel) {
    this.params = params;
    this.channel = channel;
    this.speechSessionListener = speechSessionListener;
    this.sampleRateHz = sampleRateHz;
    this.encoder = new StreamingAudioEncoder();
  }

  /** Starts a streaming speech recognition request. */
  @Override
  public synchronized void initImpl(
      SpeechRecognitionModelOptions modelOptions, int chunkSizeSamples) {
    if (chunkSizeSamples < 0.050 * sampleRateHz) {
      logger.atWarning().log(
          "Your buffer size is less than 50ms; you may have poor performance getting "
              + "streaming results.");
    }
    responseObserver = makeResponseObserver(speechSessionListener);
    encoderIsRequested = params.getEncoderParams().getEnableEncoder();
    encoderIsSupported =
        StreamingAudioEncoder.isEncoderSupported(params.getEncoderParams().getCodec());
    if (usingEncoder()) {
      try {
        encoder.init(
            sampleRateHz,
            params.getEncoderParams().getCodec(),
            params.getEncoderParams().getAllowVbr());
      } catch (StreamingAudioEncoder.EncoderException | IOException e) {
        logger.atSevere().withCause(e).log(
            "Encoder could not be created. Using uncompressed audio.");
        encoderIsRequested = false;
      }
    }
    initServer(modelOptions);
    stillSendingRequests = true;
  }

  /** Returns true when the encoder is being used. */
  public boolean usingEncoder() {
    return encoderIsRequested && encoderIsSupported;
  }

  private CloudSpeechStreamObserver makeResponseObserver(
      SpeechSessionListener speechSessionListener) {
    return new CloudSpeechStreamObserver(
        params.getObserverParams(), speechSessionListener, sessionID());
  }

  /**
   * Sends an audio buffer to the Cloud Speech server.
   *
   * @param buffer 16-bit LinearPCM byte array.
   * @param offset first element of buffer to use.
   * @param count number of elements of buffer to use.
   * @return true if audio data was processed, false if the session was already requested to close.
   *     You should wait for the recognition listener passed into the constructor to receive
   *     OK_TO_TERMINATE before destroying the session.
   */
  @Override
  public synchronized boolean processAudioBytesImpl(byte[] buffer, int offset, int count) {
    if (!isStillSendingRequests()) {
      return false;
    }
    if (usingEncoder()) {
      byte[] encoded = encoder.processAudioBytes(buffer, offset, count);
      if (encoded.length > 0) {
        streamToServer(encoded, 0, encoded.length);
      }
    } else {
      streamToServer(buffer, offset, count);
    }
    // Request a close once the server has been silent for longer than the threshold.
    if (CLOSE_SESSION_AFTER_EXTENDED_SILENCE_DURATION.isShorterThan(
        responseObserver.timeSinceLastServerActivity())) {
      logger.atInfo().log(
          "Session #%d scheduled to be ended due to extended silence.", sessionID());
      requestCloseSession();
    }
    return true;
  }

  private boolean isStillSendingRequests() {
    return stillSendingRequests && responseObserver.isStillListening();
  }
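  // A caller-side sketch of the false-return contract documented on processAudioBytesImpl
  // (hypothetical driver code; `listener` is the SpeechSessionListener handed to the constructor):
  //
  //   if (!session.processAudioBytes(buffer, 0, buffer.length)) {
  //     // The session no longer accepts audio. Keep it alive until `listener` receives
  //     // OK_TO_TERMINATE, then discard it and open a fresh CloudSpeechSession to continue.
  //   }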
  /**
   * Closes the current recognition request on the client end. This does not immediately end the
   * session. Only once the server acknowledges the closing of the session is communication
   * complete.
   */
  @Override
  public synchronized void requestCloseSessionImpl() {
    if (stillSendingRequests) {
      stillSendingRequests = false;
      if (usingEncoder()) {
        // Get any remaining output from the codec and stop.
        byte[] data = encoder.flushAndStop();
        streamToServer(data, 0, data.length);
      }
      closeServer();
    }
  }

  @Override
  public boolean requiresNetworkConnection() {
    return true;
  }

  private void initServer(SpeechRecognitionModelOptions modelOptions) {
    this.speechClient = SpeechGrpc.newStub(channel);
    requestObserver = speechClient.streamingRecognize(responseObserver);
    // Build and send a StreamingRecognizeRequest containing the parameters for
    // processing the audio.
    SpeechContext speechContext = SpeechContext.getDefaultInstance();
    RecognitionConfig.AudioEncoding encodingType = RecognitionConfig.AudioEncoding.LINEAR16;
    if (usingEncoder()) {
      switch (encoder.getCodecType()) {
        case AMRWB:
          encodingType = RecognitionConfig.AudioEncoding.AMR_WB;
          break;
        case FLAC:
          encodingType = RecognitionConfig.AudioEncoding.FLAC;
          break;
        case OGG_OPUS:
          encodingType = RecognitionConfig.AudioEncoding.OGG_OPUS;
          break;
        default:
      }
    }
    RecognitionConfig.Builder configBuilder =
        RecognitionConfig.newBuilder()
            .setEncoding(encodingType)
            .setSampleRateHertz(sampleRateHz)
            .setAudioChannelCount(1)
            .setEnableAutomaticPunctuation(true)
            .setEnableWordConfidence(true)
            .setEnableWordTimeOffsets(true)
            .addSpeechContexts(speechContext)
            .setLanguageCode(modelOptions.getLocale())
            .setProfanityFilter(params.getFilterProfanity())
            .addSpeechContexts(
                SpeechContext.newBuilder().addAllPhrases(modelOptions.getBiasWordsList()));
    StreamingRecognitionConfig.Builder strbuilder =
        StreamingRecognitionConfig.newBuilder()
            .setInterimResults(true)
            .setSingleUtterance(false);
    switch (modelOptions.getModel()) {
      case VIDEO:
        if (!modelOptions.getLocale().equals("en-US")) {
          logger.atSevere().log("Only en-US is supported by YouTube Livestream model");
        }
        configBuilder.setModel("video");
        break;
      case DICTATION_DEFAULT:
        configBuilder.setModel("default");
        break;
    }
    RecognitionConfig config = configBuilder.build();
    StreamingRecognitionConfig streamingConfig = strbuilder.setConfig(config).build();
    // First request sends the configuration.
    StreamingRecognizeRequest initial =
        StreamingRecognizeRequest.newBuilder().setStreamingConfig(streamingConfig).build();
    requestObserver.onNext(initial);
  }

  private void streamToServer(byte[] buffer, int offset, int count) {
    StreamingRecognizeRequest request =
        StreamingRecognizeRequest.newBuilder()
            .setAudioContent(ByteString.copyFrom(buffer, offset, count))
            .build();
    requestObserver.onNext(request);
  }

  private void closeServer() {
    if (requestObserver != null) {
      // Tell the server we're done sending.
      requestObserver.onCompleted();
      requestObserver = null;
    }
  }
}