// Java source code of TranscriptionResultFormatter.

/*
 * Copyright 2019 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.audio.asr;

import android.graphics.Color;
import android.graphics.Typeface;
import android.text.Spannable;
import android.text.SpannableString;
import android.text.SpannableStringBuilder;
import android.text.Spanned;
import android.text.SpannedString;
import android.text.TextUtils;
import android.text.style.ForegroundColorSpan;
import android.text.style.StyleSpan;
import com.google.audio.asr.TranscriptionResultFormatterOptions.SpeakerIndicationStyle;
import com.google.audio.asr.TranscriptionResultFormatterOptions.TextColormap;
import com.google.audio.asr.TranscriptionResultFormatterOptions.TranscriptColoringStyle;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
import org.joda.time.Duration;

/**
 * Creates a colored transcript in the format of {@link SpannedString} from {@link
 * TranscriptionResult} according to the configuration of {@link
 * TranscriptionResultFormatterOptions}.
 *
 * <p>Finalized results are formatted once and cached; the optional current hypothesis is formatted
 * on demand every time a transcript is requested.
 *
 * <p>This class is not thread-safe. If you intend to use this from multiple threads, consider
 * SafeTranscriptionResultFormatter.
 */
public class TranscriptionResultFormatter {
  // Default text colors, written as #AARRGGBB.
  private static final String WHITE = "#ffffffff"; // Alpha: 1
  private static final String BLACK = "#de000000"; // Alpha: .87
  // Color gradients can be generated using http://www.perbang.dk/rgbgradient/.
  // In order of ascending confidence.
  private static final ImmutableList<String> LIGHT_THEME_COLORS =
      ImmutableList.of("#004ffa", "#1b55c8", "#375b96", "#526164", "#6e6732", "#8a6e00");
  private static final ImmutableList<String> DARK_THEME_COLORS =
      ImmutableList.of("#004ffa", "#306dc8", "#608c69", "#90aa64", "#c0c932", "#ffff00");
  // Palette used when coloring by speaker; speaker ids wrap around once the palette is exhausted.
  private static final ImmutableList<String> SPEAKER_ID_COLORS =
      ImmutableList.of(
          "#4285f4", // blue
          "#ea4335", // red
          "#fbbc04", // yellow
          "#34a853", // green
          "#FA7B17", // orange
          "#F439A0", // pink
          "#A142F4", // purple
          "#24C1E0" // cyan
      );

  // Inclusive upper confidence bound for the theme color stored at the same index. The final
  // entry is infinite so that every (non-NaN) confidence maps to some color.
  private static final ImmutableList<Double> UPPER_CONFIDENCE_THRESHOLDS =
      ImmutableList.of(0.3, 0.55, 0.7, 0.8, 0.9, Double.POSITIVE_INFINITY);

  // The separator regex used to split a concatenated string of word values.
  private static final String JAPANESE_SPLITTER_REGEX = "\\|";

  /**
   * Returns options that apply no coloring and no italics: zero line breaks after an extended
   * silence, one line break on a language switch, and the dark theme colormap.
   */
  public static TranscriptionResultFormatterOptions noFormattingOptions() {
    return TranscriptionResultFormatterOptions.newBuilder()
        .setNumExtendedSilenceLineBreaks(0)
        .setNumLanguageSwitchLineBreaks(1)
        .setItalicizeCurrentHypothesis(false)
        .setTranscriptColoringStyle(TranscriptColoringStyle.NO_COLORING)
        .setTextColormap(TextColormap.DARK_THEME)
        .build();
  }

  /** Formatted text and the TranscriptionResult that produced it. */
  private static class CachedResult {
    /** The formatted text of {@link #result}, excluding any leading line breaks. */
    final Spanned text;
    /** The result that {@link #text} was generated from; kept so it can be reformatted. */
    final TranscriptionResult result;
    /** Line breaks separating this result from the one before it (possibly empty). */
    final Spanned leadingWhitespace;

    CachedResult(TranscriptionResult result, Spanned text, Spanned leadingWhitespace) {
      this.result = result;
      this.text = text;
      this.leadingWhitespace = leadingWhitespace;
    }

    /** Returns the leading line breaks followed by the formatted text. */
    CharSequence getFormattedText() {
      return TextUtils.concat(leadingWhitespace, text);
    }
  }

  private TranscriptionResultFormatterOptions options;

  // Finalized results in the order they were added.
  private final Deque<CachedResult> resultsDeque = new ArrayDeque<>();

  // The latest non-finalized result, or null if there is none.
  private TranscriptionResult currentHypothesis;

  // A stored string of whitespace to add between extended silences.
  private String silenceLineBreak;
  // A stored string of whitespace to add when the language switches.
  private String languageSwitchLineBreak;
  // An org.joda.time.Duration version of the options field of the same name.
  private Duration extendedSilenceDurationForLineBreaks;

  // The index of the last speaker contained in the most recently finalized result. -1 indicates
  // that no results have been seen.
  private int lastSpeakerId = -1;

  /** Creates a formatter configured with {@link #noFormattingOptions()}. */
  public TranscriptionResultFormatter() {
    setOptions(noFormattingOptions());
  }

  /** Creates a formatter configured with the given options. */
  public TranscriptionResultFormatter(TranscriptionResultFormatterOptions options) {
    setOptions(options);
    reset();
  }

  /**
   * Sets the formatter options, which may include settings of current hypotheses in italics or
   * color transcripts by confidence.
   *
   * <p>All previously finalized results are reformatted with the new options.
   */
  public void setOptions(TranscriptionResultFormatterOptions options) {
    this.options = options.toBuilder().build();

    lastSpeakerId = -1;
    // Prepare the whitespace strings.
    silenceLineBreak = createLineBreakString(options.getNumExtendedSilenceLineBreaks());
    languageSwitchLineBreak = createLineBreakString(options.getNumLanguageSwitchLineBreaks());
    extendedSilenceDurationForLineBreaks =
        TimeUtil.convert(options.getExtendedSilenceDurationForLineBreaks());

    // Reformat the old list. Iterate over a snapshot because addFinalizedResult() appends to
    // resultsDeque.
    Deque<CachedResult> previousResults = new ArrayDeque<>(resultsDeque);
    resultsDeque.clear();
    for (CachedResult previous : previousResults) {
      addFinalizedResult(previous.result);
    }
  }

  /**
   * Creates the line break string.
   *
   * @param lineBreakCount line break count in the string.
   * @return the line break string according to the lineBreakCount.
   */
  private String createLineBreakString(int lineBreakCount) {
    return Strings.repeat("\n", lineBreakCount);
  }

  /** Reset to initial state, before any calls to addFinalizedResult() or setCurrentHypothesis(). */
  public void reset() {
    resultsDeque.clear();
    lastSpeakerId = -1;
    clearCurrentHypothesis();
  }

  /**
   * Commits a result to the final transcript.
   *
   * <p>NOTE: This does not clear the hypothesis. Users who get partial results (hypotheses) should
   * prefer calling setCurrentHypothesis(...) and then finalizeCurrentHypothesis().
   */
  public void addFinalizedResult(TranscriptionResult resultSingleUtterance) {
    String lineBreak = obtainLineBreaksFromLastFinalizedResult(resultSingleUtterance);
    resultsDeque.add(
        new CachedResult(
            resultSingleUtterance.toBuilder().build(),
            formatSingleFinalized(resultSingleUtterance, !lineBreak.isEmpty()),
            SpannedString.valueOf(lineBreak)));
    // Must be updated only after formatting, which compares against the previous speaker.
    lastSpeakerId = getLastSpeakerIdTag(resultSingleUtterance);
  }

  /**
   * Removes the current hypothesis so that only the finalized results will be in the transcript.
   */
  public void clearCurrentHypothesis() {
    currentHypothesis = null;
  }

  /**
   * Commits the currently stored hypothesis to the finalized text buffer and clears the hypothesis.
   *
   * @return true if it has results to finalize, otherwise false.
   */
  public boolean finalizeCurrentHypothesis() {
    if (currentHypothesis == null) {
      return false;
    }

    addFinalizedResult(currentHypothesis);
    clearCurrentHypothesis();
    return true;
  }

  /**
   * Sets the estimate of the current text, this result is expected to change. Once it is done
   * changing, commit it, by passing it to addFinalizedResult().
   */
  public void setCurrentHypothesis(TranscriptionResult resultSingleUtterance) {
    currentHypothesis = resultSingleUtterance.toBuilder().build();
  }

  /** Returns the current finalized text with the hypothesis appended to the end. */
  public Spanned getFormattedTranscript() {
    SpannableStringBuilder builder = new SpannableStringBuilder();
    for (CachedResult cachedResult : resultsDeque) {
      builder.append(cachedResult.getFormattedText());
    }
    builder.append(getFormattedHypothesis());

    return new SpannedString(builder);
  }

  /**
   * Returns the most recent segment of the transcript: the formatted hypothesis if it is
   * non-empty, otherwise the last finalized result (or empty text if there is none).
   */
  public Spanned getMostRecentTranscriptSegment() {
    SpannableStringBuilder builder = new SpannableStringBuilder();
    builder.append(getFormattedHypothesis());
    if (!TextUtils.isEmpty(builder)) {
      return new SpannedString(builder);
    }

    if (!resultsDeque.isEmpty()) {
      builder.append(resultsDeque.getLast().getFormattedText());
    }

    return new SpannedString(builder);
  }

  /**
   * Returns the time from the start of the first finalized result to the end of the last finalized
   * result, or {@link Duration#ZERO} if nothing has been finalized.
   */
  public Duration getTranscriptDuration() {
    if (resultsDeque.isEmpty()) {
      return Duration.ZERO;
    }
    return new Duration(
        TimeUtil.toInstant(resultsDeque.peekFirst().result.getStartTimestamp()),
        TimeUtil.toInstant(resultsDeque.peekLast().result.getEndTimestamp()));
  }

  /**
   * Formats the current hypothesis, prefixed with whatever line breaks separate it from the last
   * finalized result. Returns empty text if there is no hypothesis.
   */
  private Spannable getFormattedHypothesis() {
    if (currentHypothesis == null) {
      return new SpannableString("");
    }

    SpannableStringBuilder spannableStringBuilder = new SpannableStringBuilder();
    String lineBreak = obtainLineBreaksFromLastFinalizedResult(currentHypothesis);
    boolean precededByLineBreak = !lineBreak.isEmpty();
    if (precededByLineBreak) {
      spannableStringBuilder.append(lineBreak);
    }
    spannableStringBuilder.append(formatHypothesis(currentHypothesis, precededByLineBreak));

    return SpannableString.valueOf(spannableStringBuilder);
  }

  /** Formats a result like a finalized one, then italicizes it if the options request that. */
  private Spannable formatHypothesis(TranscriptionResult result, boolean precededByLineBreak) {
    Spannable spannable = formatSingleFinalized(result, precededByLineBreak);
    if (options.getItalicizeCurrentHypothesis()) {
      spannable.setSpan(
          new StyleSpan(Typeface.ITALIC),
          0,
          spannable.length(),
          Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
    }
    return spannable;
  }

  /** A function that maps a Word to a hex color string (e.g. #a0b341). */
  @FunctionalInterface
  private interface ColorByWordFunction {
    String getColor(TranscriptionResult.Word w);
  }

  /**
   * Format a single result. precededByLineBreak is used to determine if a speaker indicator should
   * be added to reestablish context after a newline.
   *
   * <p>Results without word-level detail are colored as a whole; otherwise each word is colored
   * individually.
   */
  private Spannable formatSingleFinalized(
      TranscriptionResult result, boolean precededByLineBreak) {
    // Trim surrounding whitespace, but make sure the utterance ends in a trailing space so that
    // words from consecutive results don't get merged together.
    String normalizedTranscript = result.getText().trim() + " ";
    if (result.getWordLevelDetailList().isEmpty()) {
      // Process the transcript as a whole. Start from the theme default so that a coloring style
      // not listed below still yields a parseable color.
      String color = getDefaultColorFromTheme();
      switch (options.getTranscriptColoringStyle()) {
        case COLOR_BY_SPEAKER_ID:
          color = getColorFromSpeakerId(result.getSpeakerInfo().getSpeakerId());
          break;
        case COLOR_BY_UTTERANCE_LEVEL_CONFIDENCE:
          color = getColorFromConfidence(result);
          break;
        case COLOR_BY_WORD_LEVEL_CONFIDENCE: // No word-level detail here; use the default.
        case NO_COLORING:
        case UNSPECIFIED_COLORING_STYLE:
          break;
      }
      if (options.getSpeakerIndicationStyle() == SpeakerIndicationStyle.SHOW_SPEAKER_NUMBER
          && (precededByLineBreak || result.getSpeakerInfo().getSpeakerId() != lastSpeakerId)) {
        // A new speaker in the middle of a line gets its own line, except for the first result.
        boolean requiresLineBreak = lastSpeakerId != -1 && !precededByLineBreak;
        normalizedTranscript =
            newSpeakerChevron(result.getSpeakerInfo().getSpeakerId(), requiresLineBreak)
                + normalizedTranscript;
      }
      return makeColoredString(normalizedTranscript, color);
    } else {
      // Process each word of the transcript separately.
      ColorByWordFunction colorFunction = word -> getDefaultColorFromTheme();
      switch (options.getTranscriptColoringStyle()) {
        case COLOR_BY_WORD_LEVEL_CONFIDENCE:
          colorFunction = word -> getColorFromConfidence(word.getConfidence());
          break;
        case COLOR_BY_UTTERANCE_LEVEL_CONFIDENCE:
          colorFunction = word -> getColorFromConfidence(result); // Word-independent.
          break;
        case COLOR_BY_SPEAKER_ID:
          colorFunction = word -> getColorFromSpeakerId(word.getSpeakerInfo().getSpeakerId());
          break;
        case NO_COLORING:
        case UNSPECIFIED_COLORING_STYLE:
          break;
      }
      return addPerWordColoredStringToResult(
          normalizedTranscript,
          result.getLanguageCode(),
          result.getWordLevelDetailList(),
          precededByLineBreak,
          colorFunction);
    }
  }

  /**
   * Obtains line breaks between the last finalized result and current result. It would return an
   * empty string if no finalized transcript result existed. (Current result is the first element.)
   */
  private String obtainLineBreaksFromLastFinalizedResult(TranscriptionResult current) {
    return resultsDeque.isEmpty()
        ? ""
        : obtainLineBreaksBetweenTwoResults(resultsDeque.getLast(), current);
  }

  /**
   * Returns the line breaks to insert between two consecutive results: the silence line breaks if
   * the gap between them exceeds the configured duration, the language switch line breaks if the
   * language code changed, the longer of the two if both apply, or an empty string otherwise.
   */
  private String obtainLineBreaksBetweenTwoResults(
      CachedResult previous, TranscriptionResult current) {
    boolean languageSwitched = !previous.result.getLanguageCode().equals(current.getLanguageCode());
    // Only measure the gap when silence line breaks are enabled.
    if (options.getNumExtendedSilenceLineBreaks() > 0) {
      Duration timestampDifference =
          new Duration(
              TimeUtil.toInstant(previous.result.getEndTimestamp()),
              TimeUtil.toInstant(current.getStartTimestamp()));
      if (timestampDifference.isLongerThan(extendedSilenceDurationForLineBreaks)) {
        // If language switch and silence both happened, return the longer line break.
        return languageSwitched ? getLineBreaksWhenSilenceAndLanguageSwitch() : silenceLineBreak;
      }
    }
    return languageSwitched ? languageSwitchLineBreak : "";
  }

  /** Returns whichever of the silence and language switch line break strings is longer. */
  private String getLineBreaksWhenSilenceAndLanguageSwitch() {
    if (options.getNumExtendedSilenceLineBreaks() >= options.getNumLanguageSwitchLineBreaks()) {
      return silenceLineBreak;
    }
    return languageSwitchLineBreak;
  }

  /** Returns the part of the language code before the first '-', or "" for a null/empty code. */
  private static String getLanguageWithoutDialect(String languageCode) {
    if (TextUtils.isEmpty(languageCode)) {
      return "";
    }
    return languageCode.split("-", -1)[0];
  }

  /**
   * Returns the part of the word before the first '|' if the language is Japanese. Otherwise,
   * returns the word with any leading and trailing whitespace removed.
   */
  private static String formatWord(String languageCode, String word) {
    String language = getLanguageWithoutDialect(languageCode);
    if ("ja".equalsIgnoreCase(language)) {
      // Japanese ASR results could contain two parts per word, the former would be one of
      // Hiragana, Katakana, or Kanji, and the latter would be Katakana or none. Here extract
      // the former.
      return word.split(JAPANESE_SPLITTER_REGEX, -1)[0];
    }
    return word.trim();
  }

  /**
   * If the word occurs as a substring within the rawTranscript, then the substring starting from
   * the last occurrence of the word and extending to the end is moved from rawTranscript to the
   * front of intermediateBuilder. We assume the transcript is formatted perfectly, and then we
   * don't worry about the word divider between words for all languages if we construct the
   * transcript by words level detail.
   *
   * @return true if the word was found (and text was moved), false otherwise.
   */
  private static boolean checkWordExistedThenAdd(
      StringBuilder rawTranscript, StringBuilder intermediateBuilder, String word) {
    int index = rawTranscript.lastIndexOf(word);
    if (index == -1) {
      return false;
    }
    String transcriptToTheEnd = rawTranscript.substring(index);
    intermediateBuilder.insert(0, transcriptToTheEnd);
    rawTranscript.delete(index, rawTranscript.length());
    return true;
  }

  /**
   * Generates a Spannable with text formatted at the word level.
   *
   * @param wholeStringTranscript the whole transcript, formatted to have no leading spaces and a
   *     single trailing space
   * @param languageCode string language code, for example "en-us" or "ja"
   * @param words the list of words contained in wholeStringTranscript; must not be empty
   * @param precededByLineBreak whether line breaks were inserted directly before this transcript
   * @param colorFunction maps a word to a hex color
   */
  private Spannable addPerWordColoredStringToResult(
      String wholeStringTranscript,
      String languageCode,
      List<TranscriptionResult.Word> words,
      boolean precededByLineBreak,
      ColorByWordFunction colorFunction) {
    boolean showSpeakerNumber =
        options.getSpeakerIndicationStyle() == SpeakerIndicationStyle.SHOW_SPEAKER_NUMBER;
    // Text that has not yet been attributed to a word; consumed from the end.
    StringBuilder rawTranscript = new StringBuilder(wholeStringTranscript);
    boolean wordFound = false;
    String color = "";
    SpannableStringBuilder spannableStringBuilder = new SpannableStringBuilder();
    // Text accumulated for the span currently being built (all of one color).
    StringBuilder intermediateBuilder = new StringBuilder();
    // Group adjacent words of the same color within the same span tag.
    // Traverse in reverse then a space divider will be at the end of word.
    List<TranscriptionResult.Word> reverseWords = Lists.reverse(words);
    for (int wordIndex = 0; wordIndex < reverseWords.size(); ++wordIndex) {
      TranscriptionResult.Word word = reverseWords.get(wordIndex);

      String nextColor = colorFunction.getColor(word);
      // wordFound can only be true from the second iteration on, so wordIndex - 1 is valid below.
      if (wordFound) {
        if (!color.equals(nextColor)) {
          // The color changes here: emit the text collected so far as one span.
          spannableStringBuilder.insert(
              0, makeColoredString(intermediateBuilder.toString(), color));
          intermediateBuilder = new StringBuilder();
          wordFound = false;
        }

        if (showSpeakerNumber) {
          // If the speaker has changed, add a chevron in front of the later speaker's words.
          int previousSpeaker = reverseWords.get(wordIndex - 1).getSpeakerInfo().getSpeakerId();
          if (word.getSpeakerInfo().getSpeakerId() != previousSpeaker) {
            boolean needsAdditionalNewline = previousSpeaker != -1 && !precededByLineBreak;
            intermediateBuilder.insert(
                0, newSpeakerChevron(previousSpeaker, needsAdditionalNewline));

            spannableStringBuilder.insert(
                0, makeColoredString(intermediateBuilder.toString(), color));
            intermediateBuilder = new StringBuilder();
            wordFound = false;
          }
        }
      }
      // We'll try to find previous word if we can't find current word in the rawTranscript.
      // Append the string started from the word to the end if found.
      wordFound |=
          checkWordExistedThenAdd(
              rawTranscript, intermediateBuilder, formatWord(languageCode, word.getText()));
      color = nextColor;
    }
    boolean forceChevron =
        precededByLineBreak || words.get(0).getSpeakerInfo().getSpeakerId() != lastSpeakerId;
    // Whatever text precedes the earliest matched word joins the first span.
    intermediateBuilder.insert(0, rawTranscript.toString());
    if (showSpeakerNumber && intermediateBuilder.length() != 0 && forceChevron) {
      intermediateBuilder.insert(
          0,
          newSpeakerChevron(
              words.get(0).getSpeakerInfo().getSpeakerId(),
              lastSpeakerId != -1 && !precededByLineBreak));
    }
    spannableStringBuilder.insert(0, makeColoredString(intermediateBuilder.toString(), color));
    return SpannableString.valueOf(spannableStringBuilder);
  }

  /**
   * Generates a {@link SpannableString} containing a colored string.
   *
   * @param message the text to color
   * @param color a hex color string beginning with a pound sign, as accepted by {@link
   *     Color#parseColor(String)} (e.g. #rrggbb or #aarrggbb)
   */
  private SpannableString makeColoredString(String message, String color) {
    int textColor = Color.parseColor(color);
    SpannableString spannableString = new SpannableString(message);
    spannableString.setSpan(
        new ForegroundColorSpan(textColor),
        0,
        spannableString.length(),
        Spannable.SPAN_EXCLUSIVE_EXCLUSIVE);
    return spannableString;
  }

  /**
   * Get a string hex color associated with the confidence in {@link TranscriptionResult}, or the
   * default theme color if the result has no confidence.
   */
  private String getColorFromConfidence(TranscriptionResult result) {
    if (result.hasConfidence()) {
      return getColorFromConfidence(result.getConfidence());
    }
    return getDefaultColorFromTheme();
  }

  /**
   * Get a string hex color associated with a confidence value on the range [0, 1] according to
   * specified confidence. The color is the entry of the current theme's color list whose
   * threshold is the first one not below the confidence.
   */
  private String getColorFromConfidence(float confidence) {
    ImmutableList<String> colormap = getColorList(options.getTextColormap());
    for (int i = 0; i < UPPER_CONFIDENCE_THRESHOLDS.size(); ++i) {
      if (confidence <= UPPER_CONFIDENCE_THRESHOLDS.get(i)) {
        return colormap.get(i);
      }
    }
    // Only reachable for NaN, because the last threshold is infinity.
    return getDefaultColorFromTheme();
  }

  /** Returns the hex code of the default text color according to the theme. */
  private String getDefaultColorFromTheme() {
    switch (options.getTextColormap()) {
      case DARK_THEME:
        return WHITE;
      case LIGHT_THEME:
      case UNSPECIFIED_THEME:
        return BLACK;
    }
    return WHITE;
  }

  /**
   * Get a string hex color associated with the speaker number. Speaker ids beyond the size of the
   * palette wrap around, so distinct speakers may share a color. Negative ids are mapped into the
   * palette as well instead of causing an out-of-bounds access.
   */
  private String getColorFromSpeakerId(int speakerId) {
    int paletteSize = SPEAKER_ID_COLORS.size();
    // Java's % keeps the sign of the dividend, so normalize the index to [0, paletteSize).
    int index = ((speakerId % paletteSize) + paletteSize) % paletteSize;
    return SPEAKER_ID_COLORS.get(index);
  }

  /** Returns the confidence color gradient (ascending confidence) for the given theme. */
  private static ImmutableList<String> getColorList(TextColormap colormap) {
    switch (colormap) {
      case LIGHT_THEME:
      case UNSPECIFIED_THEME:
        return LIGHT_THEME_COLORS;
      case DARK_THEME:
        return DARK_THEME_COLORS;
    }
    return DARK_THEME_COLORS;
  }

  /**
   * Builds the speaker indicator text, e.g. "≫ 2: ", optionally starting with a newline.
   *
   * @param tag the speaker id to display
   * @param includesNewline whether to put the indicator on a new line
   */
  private static String newSpeakerChevron(int tag, boolean includesNewline) {
    return (includesNewline ? "\n≫ " : "≫ ") + Integer.toString(tag) + ": ";
  }

  /**
   * Returns the speaker id of the last word in the result, or the result-level speaker id if the
   * result has no word-level detail.
   */
  private static int getLastSpeakerIdTag(TranscriptionResult result) {
    if (result.getWordLevelDetailCount() == 0) {
      return result.getSpeakerInfo().getSpeakerId();
    } else {
      return result
          .getWordLevelDetailList()
          .get(result.getWordLevelDetailCount() - 1)
          .getSpeakerInfo()
          .getSpeakerId();
    }
  }
}