/* * Copyright 2011-2016, Institute of Cybernetics at Tallinn University of Technology * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ee.ioc.phon.android.speechutils; import android.media.AudioFormat; import java.util.concurrent.atomic.AtomicLong; import ee.ioc.phon.android.speechutils.utils.AudioUtils; public abstract class AbstractAudioRecorder implements AudioRecorder { private static final int RESOLUTION = AudioFormat.ENCODING_PCM_16BIT; private static final int BUFFER_SIZE_MULTIPLIER = 4; // was: 2 private static final int DEFAULT_BUFFER_LENGTH_IN_MILLIS = 35000; private SpeechRecord mRecorder = null; private double mAvgEnergy = 0; private final int mSampleRate; private final int mSamplesInOneSec; private final int mSamplesInOneMilliSec; private final boolean mAlwaysListen; // Recorder state private State mState; // The complete space into which the recording in written. // Its maximum length is about: // 2 (bytes) * 1 (channels) * 30 (max rec time in seconds) * 44100 (times per second) = 2 646 000 bytes // but typically is: // 2 (bytes) * 1 (channels) * 20 (max rec time in seconds) * 16000 (times per second) = 640 000 bytes final byte[] mRecording; // TODO: use: mRecording.length instead private int mRecordedLength = 0; private AtomicLong mRecordedSessionId = new AtomicLong(0L); boolean mRecordingBufferIsFullWithData = false; private final int mRecordingBufferLengthMillis; // The number of bytes the client has already consumed private int mConsumedLength = 0; private AtomicLong mConsumedSessionId = new AtomicLong(0L); // Buffer for output private byte[] mBuffer; protected AbstractAudioRecorder(int audioSource, int sampleRate, int recordingBufferLengthMillis, boolean alwaysListen) { mSampleRate = sampleRate; // E.g. 1 second of 16kHz 16-bit mono audio takes 32000 bytes. mSamplesInOneSec = RESOLUTION_IN_BYTES * CHANNELS * mSampleRate; mSamplesInOneMilliSec = (int) ((double) mSamplesInOneSec / 1000.0); mRecordingBufferLengthMillis = recordingBufferLengthMillis; mRecording = new byte[mSamplesInOneMilliSec * mRecordingBufferLengthMillis]; mAlwaysListen = alwaysListen; } protected AbstractAudioRecorder(int audioSource, int sampleRate) { this(audioSource, sampleRate, DEFAULT_BUFFER_LENGTH_IN_MILLIS, false); } protected SpeechRecord createRecorder(int audioSource, int sampleRate, int bufferSize) { if (mRecorder != null) release(); mRecorder = new SpeechRecord(audioSource, sampleRate, AudioFormat.CHANNEL_IN_MONO, RESOLUTION, bufferSize, false, false, false); if (getSpeechRecordState() != SpeechRecord.STATE_INITIALIZED) { throw new IllegalStateException("SpeechRecord initialization failed"); } return mRecorder; } // TODO: remove protected void createBuffer(int framePeriod) { mBuffer = new byte[framePeriod * RESOLUTION_IN_BYTES * CHANNELS]; } protected int getBufferSize() { int minBufferSizeInBytes = SpeechRecord.getMinBufferSize(mSampleRate, AudioFormat.CHANNEL_IN_MONO, RESOLUTION); if (minBufferSizeInBytes == SpeechRecord.ERROR_BAD_VALUE) { throw new IllegalArgumentException("SpeechRecord.getMinBufferSize: parameters not supported by hardware"); } else if (minBufferSizeInBytes == SpeechRecord.ERROR) { Log.e("SpeechRecord.getMinBufferSize: unable to query hardware for output properties"); minBufferSizeInBytes = mSampleRate * (120 / 1000) * RESOLUTION_IN_BYTES * CHANNELS; } int bufferSize = BUFFER_SIZE_MULTIPLIER * minBufferSizeInBytes; Log.i("SpeechRecord buffer size: " + bufferSize + ", min size = " + minBufferSizeInBytes); return bufferSize; } /** * Returns the recorded bytes since the last call, and resets the recording. * * @return bytes that have been recorded since this method was last called */ public synchronized byte[] consumeRecordingAndTruncate() { int len = getConsumedLength(); byte[] bytes = getCurrentRecording(len); setRecordedLength(0); setConsumedLength(0); return bytes; } public int getSampleRate() { return mSampleRate; } protected int getNumOfSamplesIn(int millis) { return Math.abs(millis) * mSamplesInOneMilliSec; } protected boolean isRecordedSessionSameAsConsumedSession() { return mRecordedSessionId.get() == mConsumedSessionId.get(); } /** * Checking of the read status. * The total recording array has been pre-allocated (e.g. for 35 seconds of audio). * If it gets full (status == -5) then the recording is stopped. */ protected int getStatus(int numOfBytes, int len) { Log.i("Read bytes: request/actual: " + len + "/" + numOfBytes); if (numOfBytes < 0) { Log.e("AudioRecord error: " + numOfBytes); return numOfBytes; } if (numOfBytes > len) { Log.e("Read more bytes than is buffer length: " + numOfBytes + ": " + len); return -100; } else if (numOfBytes == 0) { Log.e("Read zero bytes"); return -200; } else if (mRecording.length < mRecordedLength + numOfBytes) { Log.e("Recorder buffer overflow: " + mRecordedLength); return -300; } return 0; } /** * Check if the consume pointer was crossed by the recorded pointer. As long as the consume * pointer was not crossed, the consumption of the buffer may continue as usual and no sound gap * will occur. Once the consume pointer was crossed (e.g. it was on sample 1000 and prior to this * read the recorder was on sample 750 and now that it read the new sample it's on 1500), there's * an audio gap between the consumer and the recorder that can not be filled (data is lost with * no ability to get it back). Whenever this kind of cross occurs, the calling code changed the * session id of the recorder so that if consume is called (from ContinuousRawAudioRecorder), * it will not assume that the data is complete and could be fetched but it will act according to * the SessionStartPointer configured (e.g. read the buffer from the beginning, from now, or from * now - X millis) * * @param reachedTheEndOfRecordingBuffer - in case that in the read before the call to this method the recorder * passed the end of the buffer and returned to the beginning * @param numOfBytesRead - in the reading process * @return true/false according to the above logic */ private boolean isConsumePointerCrossed(boolean reachedTheEndOfRecordingBuffer, int numOfBytesRead) { return numOfBytesRead > 0 && mRecordingBufferIsFullWithData && isRecordedSessionSameAsConsumedSession() && ((mRecordedLength - numOfBytesRead < mConsumedLength && mRecordedLength >= mConsumedLength) || (reachedTheEndOfRecordingBuffer && mConsumedLength < mRecordedLength)); } public long markNewRecordingSession() { return mRecordedSessionId.incrementAndGet(); } /** * Copy data from the given recorder into the given buffer, and append to the complete recording. * public int read (byte[] audioData, int offsetInBytes, int sizeInBytes) */ protected int read(SpeechRecord recorder, byte[] buffer) { int len = buffer.length; int numOfBytes = recorder.read(buffer, 0, len); // handling mediaserver crashes here // it doesn't happen a lot but it happens and the way to handle it is to fully restart // the audio recorder if (numOfBytes == 0 && mAlwaysListen) { consumeRecordingAndTruncate(); mBuffer = new byte[mBuffer.length]; createRecorder(recorder.getAudioSource(), recorder.getSampleRate(), getBufferSize()); start(); } int status = getStatus(numOfBytes, len); boolean reachedTheEndOfRecordingBuffer = false; // if we need to keep on listening, when reaching the end of the recorded buffer, // continue to write from the beginning. thus, we have a cyclic buffer if (mAlwaysListen && status == -300) { reachedTheEndOfRecordingBuffer = true; status = 0; // for use when consuming the recorded buffer, the buffer is now in it's cyclic phase if (!mRecordingBufferIsFullWithData) mRecordingBufferIsFullWithData = true; } if (status == 0 && numOfBytes >= 0) { if (!reachedTheEndOfRecordingBuffer) { // arraycopy(Object src, int srcPos, Object dest, int destPos, int length) // numOfBytes <= len, typically == len, but at the end of the recording can be < len. System.arraycopy(buffer, 0, mRecording, mRecordedLength, numOfBytes); mRecordedLength += numOfBytes; } else { int numOfBytesBeforeCyclic = mRecording.length - mRecordedLength; System.arraycopy(buffer, 0, mRecording, mRecordedLength, numOfBytesBeforeCyclic); System.arraycopy(buffer, numOfBytesBeforeCyclic, mRecording, 0, numOfBytes - numOfBytesBeforeCyclic); mRecordedLength = numOfBytes - numOfBytesBeforeCyclic; } // increment the recorded session id in case that the consume pointer was crossed if (isConsumePointerCrossed(reachedTheEndOfRecordingBuffer, numOfBytes)) { Log.i("recorder session changed. mRecordedLength was: " + (mRecordedLength - numOfBytes) + " and now it is: " + mRecordedLength + " while the mConsumedLength is: " + mConsumedLength); markNewRecordingSession(); } } return mAlwaysListen ? 0 : status; } /** * @return recorder state */ public State getState() { return mState; } protected void setState(State state) { mState = state; } /** * @return bytes that have been recorded since the beginning */ public byte[] getCompleteRecording() { return getCurrentRecording(0); } /** * @return bytes that have been recorded since the beginning, with wav-header */ public byte[] getCompleteRecordingAsWav() { return getRecordingAsWav(getCompleteRecording(), mSampleRate); } public static byte[] getRecordingAsWav(byte[] pcm, int sampleRate) { return AudioUtils.getRecordingAsWav(pcm, sampleRate, RESOLUTION_IN_BYTES, CHANNELS); } /** * @return bytes that have been recorded since this method was last called */ public synchronized byte[] consumeRecording() { byte[] bytes = getCurrentRecording(mConsumedLength); if (bytes == null) return null; // this is to avoid race (set the consumed length to be the recorded length though // the last recording was empty while recorded length moved on - thus we always miss // a part of the recording) mConsumedLength = mRecordedLength; mConsumedSessionId.set(mRecordedSessionId.get()); return bytes; } protected byte[] getCurrentRecording(int startPos) { int len = getLength() - startPos; byte[] bytes = new byte[len]; System.arraycopy(mRecording, startPos, bytes, 0, len); Log.i("Copied (raw) from pos: " + startPos + ", bytes: " + bytes.length); return bytes; } protected int getConsumedLength() { return mConsumedLength; } protected void setConsumedLength(int len) { mConsumedLength = len; } protected void setRecordedLength(int len) { mRecordedLength = len; } public int getLength() { return mRecordedLength; } /** * @return <code>true</code> iff a speech-ending pause has occurred at the end of the recorded data */ public boolean isPausing() { double pauseScore = getPauseScore(); Log.i("Pause score: " + pauseScore); return pauseScore > 7; } /** * @return volume indicator that shows the average volume of the last read buffer */ public float getRmsdb() { long sumOfSquares = getRms(mRecordedLength, mBuffer.length); double rootMeanSquare = Math.sqrt(sumOfSquares / (mBuffer.length / 2)); if (rootMeanSquare > 1) { // TODO: why 10? return (float) (10 * Math.log10(rootMeanSquare)); } return 0; } /** * <p>In order to calculate if the user has stopped speaking we take the * data from the last second of the recording, map it to a number * and compare this number to the numbers obtained previously. We * return a confidence score (0-INF) of a longer pause having occurred in the * speech input.</p> * <p/> * <p>TODO: base the implementation on some well-known technique.</p> * * @return positive value which the caller can use to determine if there is a pause */ private double getPauseScore() { long t2 = getRms(mRecordedLength, mSamplesInOneSec); if (t2 == 0) { return 0; } double t = mAvgEnergy / t2; mAvgEnergy = (2 * mAvgEnergy + t2) / 3; return t; } /** * <p>Stops the recording (if needed) and releases the resources. * The object can no longer be used and the reference should be * set to null after a call to release().</p> */ public synchronized void release() { if (mRecorder != null) { if (mRecorder.getRecordingState() == SpeechRecord.RECORDSTATE_RECORDING) { stop(); } mRecorder.release(); mRecorder = null; } } /** * <p>Starts the recording, and sets the state to RECORDING.</p> */ public void start() { if (getSpeechRecordState() == SpeechRecord.STATE_INITIALIZED) { mRecorder.startRecording(); if (mRecorder.getRecordingState() == SpeechRecord.RECORDSTATE_RECORDING) { setState(State.RECORDING); new Thread() { public void run() { recorderLoop(mRecorder); } }.start(); } else { handleError("startRecording() failed"); } } else { handleError("start() called on illegal state"); } } /** * <p>Stops the recording, and sets the state to STOPPED. * If stopping fails then sets the state to ERROR.</p> */ public void stop() { // We check the underlying SpeechRecord state trying to avoid IllegalStateException. // If it still occurs then we catch it. if (getSpeechRecordState() == SpeechRecord.STATE_INITIALIZED && mRecorder.getRecordingState() == SpeechRecord.RECORDSTATE_RECORDING) { try { mRecorder.stop(); setState(State.STOPPED); } catch (IllegalStateException e) { handleError("native stop() called in illegal state: " + e.getMessage()); } } else { handleError("stop() called in illegal state"); } } protected void recorderLoop(SpeechRecord recorder) { while (recorder.getRecordingState() == SpeechRecord.RECORDSTATE_RECORDING) { int status = read(recorder, mBuffer); if (status < 0) { handleError("status = " + status); break; } } } private long getRms(int end, int span) { int begin = end - span; if (begin < 0) { begin = 0; } // make sure begin is even if (0 != (begin % 2)) { begin++; } long sum = 0; for (int i = begin; i < end; i += 2) { short curSample = getShort(mRecording[i], mRecording[i + 1]); sum += curSample * curSample; } return sum; } /* * Converts two bytes to a short (assuming little endian). * TODO: We don't need the whole short, just take the 2nd byte (the more significant one) * TODO: Most Android devices are little endian? */ private static short getShort(byte argB1, byte argB2) { //if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) { // return (short) ((argB1 << 8) | argB2); //} return (short) (argB1 | (argB2 << 8)); } protected void handleError(String msg) { release(); setState(State.ERROR); Log.e(msg); } private int getSpeechRecordState() { if (mRecorder == null) { return SpeechRecord.STATE_UNINITIALIZED; } return mRecorder.getState(); } }