// mirror of https://github.com/psychopy/psychojs.git
// synced 2025-05-12 16:48:10 +00:00
/**
 * AudioClip encapsulates an audio recording.
 *
 * @author Alain Pitiot and Sotiri Bakagiannis
 * @version 2021.x
 * @copyright (c) 2021 Open Science Tools Ltd. (https://opensciencetools.org)
 * @license Distributed under the terms of the MIT License
 */
|
|
|
|
import {PsychObject} from '../util/PsychObject';
|
|
import {PsychoJS} from '../core/PsychoJS';
|
|
import {ExperimentHandler} from '../data/ExperimentHandler';
|
|
import * as util from '../util/Util';
|
|
|
|
|
|
/**
 * <p>AudioClip encapsulates an audio recording.</p>
 *
 * @name module:sound.AudioClip
 * @class
 * @param {Object} options
 * @param {module:core.PsychoJS} options.psychoJS - the PsychoJS instance
 * @param {String} [options.name= 'audioclip'] - the name used when logging messages
 * @param {String} options.format - the format of the audio file (e.g. a MIME type)
 * @param {number} options.sampleRateHz - the sampling rate
 * @param {Blob} options.data - the audio data, in the given format, at the given sampling rate
 * @param {boolean} [options.autoLog= false] - whether or not to log
 */
|
|
export class AudioClip extends PsychObject
{
	constructor({psychoJS, name, sampleRateHz, format, data, autoLog} = {})
	{
		super(psychoJS);

		this._addAttribute('name', name, 'audioclip');
		this._addAttribute('format', format);
		this._addAttribute('sampleRateHz', sampleRateHz);
		this._addAttribute('data', data);
		// NOTE(review): (value=false, default=autoLog) looks swapped relative to the
		// usual (value, defaultValue) convention — confirm against PsychObject._addAttribute
		this._addAttribute('autoLog', false, autoLog);
		this._addAttribute('status', AudioClip.Status.CREATED);

		if (this._autoLog)
		{
			this._psychoJS.experimentLogger.exp(`Created ${this.name} = ${this.toString()}`);
		}

		// start decoding the blob into an audio buffer; the returned promise is
		// deliberately not awaited here — play() and transcribe() await the decoding:
		this._decodeAudio();
	}


	/**
	 * Play the audio clip.
	 *
	 * @name module:sound.AudioClip#play
	 * @function
	 * @public
	 * @return {Promise<void>} promise that resolves when playback has started
	 */
	async play()
	{
		this._psychoJS.logger.debug('request to play the audio clip');

		// wait for the decoding to complete:
		await this._decodeAudio();

		// play the decoded audio buffer:
		const source = this._audioContext.createBufferSource();
		source.buffer = this._audioBuffer;
		source.connect(this._audioContext.destination);
		source.start();
	}


	/**
	 * Upload the audio clip to the pavlovia server.
	 *
	 * <p>If the experiment is not running on the server (e.g. it runs locally), or is not
	 * in the RUNNING state, or is in piloting mode, the clip is instead offered to the
	 * participant as a file download.</p>
	 *
	 * @name module:sound.AudioClip#upload
	 * @function
	 * @public
	 * @return {Promise} promise resolving upon completion of the upload (or download)
	 */
	upload()
	{
		this._psychoJS.logger.debug('request to upload the audio clip to pavlovia.org');

		// add a format-dependent audio extension to the name:
		const filename = this._name + util.extensionFromMimeType(this._format);

		// if the audio recording cannot be uploaded, e.g. the experiment is running locally, or
		// if it is piloting mode, then we offer the audio clip as a file for download:
		if (this._psychoJS.getEnvironment() !== ExperimentHandler.Environment.SERVER ||
			this._psychoJS.config.experiment.status !== 'RUNNING' ||
			this._psychoJS._serverMsg.has('__pilotToken'))
		{
			return this.download(filename);
		}

		// upload the data:
		return this._psychoJS.serverManager.uploadAudio(this._data, filename);
	}


	/**
	 * Offer the audio clip to the participant as a sound file to download.
	 *
	 * @name module:sound.AudioClip#download
	 * @function
	 * @public
	 * @param {String} [filename= 'audio.webm'] - the filename suggested to the participant
	 */
	download(filename = 'audio.webm')
	{
		// create a transient anchor pointing at the blob and simulate a click on it:
		const anchor = document.createElement('a');
		anchor.href = window.URL.createObjectURL(this._data);
		anchor.download = filename;
		document.body.appendChild(anchor);
		anchor.click();
		document.body.removeChild(anchor);
	}


	/**
	 * Transcribe the audio clip.
	 *
	 * ref: https://cloud.google.com/speech-to-text/docs/reference/rest/v1/speech/recognize
	 *
	 * @name module:sound.AudioClip#transcribe
	 * @function
	 * @public
	 * @param {Object} options
	 * @param {Symbol} options.engine - the speech-to-text engine (currently unused; only
	 *   AudioClip.Engine.GOOGLE is supported)
	 * @param {String} options.languageCode - the BCP-47 language code for the recognition,
	 *   e.g. 'en-gb'
	 * @param {String} options.key - the API key for the speech-to-text service
	 * @return {Promise<Object>} the first transcription alternative returned by the service
	 * @throws {Object} when the service response contains no transcription results
	 */
	async transcribe({engine, languageCode, key} = {})
	{
		this._psychoJS.logger.debug('request to transcribe the audio clip');

		// wait for the decoding to complete:
		await this._decodeAudio();

		// convert the Float32 PCM audio data to UInt16
		// (negative samples wrap to their 16-bit two's-complement representation,
		// as required by the LINEAR16 encoding):
		const buffer = new ArrayBuffer(this._audioData.length * 2);
		const uint16View = new Uint16Array(buffer);
		for (let t = 0; t < this._audioData.length; ++t)
		{
			uint16View[t] = (this._audioData[t] < 0)
				? this._audioData[t] * 0x8000
				: this._audioData[t] * 0x7FFF;
		}

		// encode it to base64:
		const base64Data = this._base64ArrayBuffer(new Uint8Array(buffer));

		// query the Google speech-to-text service:
		const body = {
			config: {
				encoding: 'LINEAR16',
				sampleRateHertz: this._sampleRateHz,
				languageCode
			},
			audio: {
				content: base64Data
			},
		};

		// TODO get the key from the designer's pavlovia account
		const url = `https://speech.googleapis.com/v1/speech:recognize?key=${key}`;

		const response = await fetch(url, {
			method: 'POST',
			headers: {
				'Content-Type': 'application/json',
			},
			body: JSON.stringify(body)
		});

		// convert the response to json:
		const decodedResponse = await response.json();
		this._psychoJS.logger.debug('speech.googleapis.com response:', JSON.stringify(decodedResponse));

		// bug fix: the previous implementation wrapped this method in an explicit
		// new Promise with an async executor; a missing 'results' field then threw
		// inside the executor, leaving the returned promise forever pending.
		// We now reject cleanly with an informative error instead:
		if (!decodedResponse.results || decodedResponse.results.length === 0 ||
			!decodedResponse.results[0].alternatives ||
			decodedResponse.results[0].alternatives.length === 0)
		{
			throw {
				origin: 'AudioClip.transcribe',
				context: 'when transcribing the audio clip',
				error: decodedResponse.error || decodedResponse
			};
		}

		// TODO deal with more than one results and/or alternatives
		return decodedResponse.results[0].alternatives[0];
	}


	/**
	 * Decode the formatted audio data (e.g. webm) into a 32bit float PCM audio buffer.
	 *
	 * <p>Safe to call repeatedly: if decoding has already completed it returns
	 * immediately, and if decoding is in progress the returned promise resolves
	 * when that decoding finishes.</p>
	 *
	 * @name module:sound.AudioClip#_decodeAudio
	 * @function
	 * @private
	 * @returns {Promise<void>|undefined} promise resolving when decoding completes,
	 *   or undefined if the clip is already decoded
	 */
	_decodeAudio()
	{
		this._psychoJS.logger.debug('request to decode the data of the audio clip');

		// if the audio clip is ready, the PCM audio data is available in _audioData, a Float32Array:
		if (this._status === AudioClip.Status.READY)
		{
			return;
		}

		// if we are already decoding, wait until the process completed:
		if (this._status === AudioClip.Status.DECODING)
		{
			return new Promise((resolve, reject) =>
			{
				this._decodingCallbacks.push(resolve);

				// TODO also register reject so decoding failures propagate to the callers
			});
		}

		// otherwise, start decoding the input formatted audio data:
		this._status = AudioClip.Status.DECODING;
		this._audioData = null;
		this._decodingCallbacks = [];

		this._audioContext = new (window.AudioContext || window.webkitAudioContext)({
			sampleRate: this._sampleRateHz
		});

		const reader = new window.FileReader();
		reader.onloadend = async () =>
		{
			try
			{
				// decode the ArrayBuffer containing the formatted audio data (e.g. webm)
				// into an audio buffer:
				this._audioBuffer = await this._audioContext.decodeAudioData(reader.result);

				// get the Float32Array containing the PCM data:
				this._audioData = this._audioBuffer.getChannelData(0);

				// we are now ready to translate and play:
				this._status = AudioClip.Status.READY;

				// resolve all the promises waiting for the decoding to complete:
				for (const callback of this._decodingCallbacks)
				{
					callback();
				}
			}
			catch (error)
			{
				console.error(error);

				// TODO reject the pending promises instead of leaving them unsettled
			}
		};

		reader.onerror = (error) =>
		{
			// bug fix: errors were previously swallowed silently; at minimum log them:
			console.error(error);

			// TODO reject the pending promises with the reader error
		};

		reader.readAsArrayBuffer(this._data);

		// bug fix: the fresh-decode path previously returned undefined, so awaiting it
		// resolved before decoding completed; return a promise resolved on completion:
		return new Promise((resolve) =>
		{
			this._decodingCallbacks.push(resolve);
		});
	}


	/**
	 * Convert an array buffer to a base64 string.
	 *
	 * @note this is only very lightly adapted from the following post of @Grantlyk:
	 * https://gist.github.com/jonleighton/958841#gistcomment-1953137
	 *
	 * the following only works for small buffers:
	 * const dataAsString = String.fromCharCode.apply(null, new Uint8Array(buffer));
	 * base64Data = window.btoa(dataAsString);
	 *
	 * @name module:sound.AudioClip#_base64ArrayBuffer
	 * @function
	 * @private
	 * @param {Uint8Array|ArrayBuffer} arrayBuffer - the buffer of bytes to encode
	 * @return {string} the base64 encoded input buffer
	 */
	_base64ArrayBuffer(arrayBuffer)
	{
		let base64 = '';
		const encodings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

		const bytes = new Uint8Array(arrayBuffer);
		const byteLength = bytes.byteLength;
		const byteRemainder = byteLength % 3;
		const mainLength = byteLength - byteRemainder;

		let a;
		let b;
		let c;
		let d;
		let chunk;

		// Main loop deals with bytes in chunks of 3
		for (let i = 0; i < mainLength; i += 3) {
			// Combine the three bytes into a single integer
			chunk = (bytes[i] << 16) | (bytes[i + 1] << 8) | bytes[i + 2];

			// Use bitmasks to extract 6-bit segments from the triplet
			a = (chunk & 16515072) >> 18; // 16515072 = (2^6 - 1) << 18
			b = (chunk & 258048) >> 12; // 258048 = (2^6 - 1) << 12
			c = (chunk & 4032) >> 6; // 4032 = (2^6 - 1) << 6
			d = chunk & 63; // 63 = 2^6 - 1

			// Convert the raw binary segments to the appropriate ASCII encoding
			base64 += encodings[a] + encodings[b] + encodings[c] + encodings[d];
		}

		// Deal with the remaining bytes and padding
		if (byteRemainder === 1) {
			chunk = bytes[mainLength];

			a = (chunk & 252) >> 2; // 252 = (2^6 - 1) << 2

			// Set the 4 least significant bits to zero
			b = (chunk & 3) << 4; // 3 = 2^2 - 1

			base64 += `${encodings[a]}${encodings[b]}==`;
		} else if (byteRemainder === 2) {
			chunk = (bytes[mainLength] << 8) | bytes[mainLength + 1];

			a = (chunk & 64512) >> 10; // 64512 = (2^6 - 1) << 10
			b = (chunk & 1008) >> 4; // 1008 = (2^6 - 1) << 4

			// Set the 2 least significant bits to zero
			c = (chunk & 15) << 2; // 15 = 2^4 - 1

			base64 += `${encodings[a]}${encodings[b]}${encodings[c]}=`;
		}

		return base64;
	}

}
|
|
|
|
|
|
/**
 * Recognition engines.
 *
 * @name module:sound.AudioClip#Engine
 * @enum {Symbol}
 * @readonly
 * @public
 */
AudioClip.Engine = {
	/**
	 * Google Cloud Speech-to-Text.
	 */
	GOOGLE: Symbol.for('GOOGLE')
};
|
|
|
|
|
|
/**
 * AudioClip status.
 *
 * @name module:sound.AudioClip#Status
 * @enum {Symbol}
 * @readonly
 * @public
 */
AudioClip.Status = {
	/**
	 * The clip has been created but the audio data is not yet decoded.
	 */
	CREATED: Symbol.for('CREATED'),

	/**
	 * The formatted audio data is being decoded into a PCM audio buffer.
	 */
	DECODING: Symbol.for('DECODING'),

	/**
	 * Decoding has completed; the clip can be played, uploaded, or transcribed.
	 */
	READY: Symbol.for('READY')
};
|