Speech-to-Text (Streaming) (Beta)
Extract a transcript from an audio stream. This endpoint must be accessed over a WebSocket connection.
Endpoint
wss://translate.rozetta-api.io/api/v1/translate/stt-streaming

Request details
Header

accessKey, nonce, signature: Please refer to the authentication section.
Command

SET_LANGUAGE: Sets the speech language. Acceptable language codes are "en" (English), "ja" (Japanese), "zh-CN" (Mandarin Chinese), "zh-HK" (Cantonese), and "zh-TW" (Chinese).
SET_SAMPLING_RATE: Sets the audio stream sampling rate. The recommended value is 16000.
END_STREAM: Signifies the end of the audio stream. No value is required.
END_SESSION: Signifies the end of the speech recognition session. No value is required. Once the session ends, the WebSocket connection is closed.
Command example (set speech language)
{
  "command": "SET_LANGUAGE",
  "value": "ja"
}
Command example (signify end of stream)
{
  "command": "END_STREAM"
}
Audio stream
The audio stream should be sent in binary form over the WebSocket connection. The audio must be in WAV format with exactly 1 channel and 16 bits per sample.
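Large WAV files can be sent as a series of smaller binary messages rather than a single large frame. As a hedged sketch (the 32 KiB chunk size below is an assumption for illustration, not a documented API requirement), the buffer can be split before sending each piece with `connection.send()`:

```javascript
// Split a WAV buffer into fixed-size binary chunks for streaming over the
// WebSocket connection. The 32 KiB chunk size is an assumption for
// illustration, not a documented API requirement.
function* audioChunks(buffer, chunkSize = 32 * 1024) {
  for (let offset = 0; offset < buffer.length; offset += chunkSize) {
    // subarray clamps the end index, so the last chunk may be shorter.
    yield buffer.subarray(offset, offset + chunkSize);
  }
}

// Usage, given an open `connection` and a WAV `audioBuffer`:
// for (const chunk of audioChunks(audioBuffer)) connection.send(chunk);
// connection.send(JSON.stringify({ command: 'END_STREAM' }));
```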
Response

LANGUAGE_READY: The speech language has been set successfully.
SAMPLING_RATE_READY: The audio stream sampling rate has been set successfully.
RECOGNITION_RESULT: The recognized transcript from the audio stream. This type of response may be sent multiple times during the transfer of the audio stream.
RECOGNITION_ERROR: An error message from the speech recognizer. This type of response may be sent multiple times.
Response example (language is set)
{
  "type": "LANGUAGE_READY"
}
Response example (recognition result)
{
  "type": "RECOGNITION_RESULT",
  "value": "そこに着いたらもう一度誰かに尋ねてください"
}

api/v1/translate/stt-streaming
Send an audio stream for on-the-fly speech recognition.
const fs = require('fs');
const WebSocket = require('ws');

const authUtils = require('./utils/auth-utils');

const fsPromise = fs.promises;

const apiPath = '/api/v1/translate/stt-streaming';
const apiEndpoint = `wss://translate.rozetta-api.io${apiPath}`;
const authConfig = {
  accessKey: 'ACCESS_KEY',
  secretKey: 'SECRET_KEY',
  nonce: Date.now().toString(),
  contractId: 'CONTRACT_ID',
};
const speechData = {
  language: 'ja',
  samplingRate: 16000,
  audioFile: 'speech.wav',
  audioBuffer: null,
};

/**
* Command type sent from the client.
*/
const commandType = {
  setLanguage: 'SET_LANGUAGE',
  setSamplingRate: 'SET_SAMPLING_RATE',
  endStream: 'END_STREAM',
  endSession: 'END_SESSION',
};

/**
* Response types received from API endpoint.
*/
const responseType = {
  languageReady: 'LANGUAGE_READY',
  samplingRateReady: 'SAMPLING_RATE_READY',
  recognitionResult: 'RECOGNITION_RESULT',
  recognitionError: 'RECOGNITION_ERROR',
};

const getAuth = (url) => {
  const nonce = Date.now().toString();
  return {
    accessKey: authConfig.accessKey,
    nonce: nonce,
    signature: authUtils.generateSignature(url, authConfig.secretKey, nonce),
    remoteurl: url,
    contractId: authConfig.contractId,
  };
};

const handleSessionMessage = (connection, message) => {
  const messageJSON = JSON.parse(message);
  switch (messageJSON.type) {
    case responseType.languageReady:
      // The language is set. Set the sampling rate.
      console.log('Language is set. Set sampling rate.');
      connection.send(JSON.stringify({
        command: commandType.setSamplingRate,
        value: speechData.samplingRate,
      }));
      break;
    case responseType.samplingRateReady:
      // The sampling rate is set. Send the audio data stream.
      console.log('Sampling rate is set. Send audio data stream.');
      connection.send(speechData.audioBuffer);
      connection.send(JSON.stringify({
        command: commandType.endStream,
      }));
      break;
    case responseType.recognitionResult:
      console.log('Recognized transcript:');
      console.log(messageJSON.value);
      break;
    case responseType.recognitionError:
      console.error('Recognition error:');
      console.error(messageJSON.value);
      // In case of error, we close the connection immediately.
      connection.send(JSON.stringify({
        command: commandType.endSession,
      }));
      break;
    default:
      console.log('Unexpected response type:');
      console.log(messageJSON.type);
  }
};

const main = async () => {
  speechData.audioBuffer = await fsPromise.readFile(speechData.audioFile);
  const auth = getAuth(apiPath);
  console.log(apiPath);
  console.log(auth);
  const auth64 = Buffer.from(JSON.stringify(auth)).toString('base64');
  const url = `${apiEndpoint}?auth=${auth64}`;
  console.log(url);
  const connection = new WebSocket(url);
  connection.on('open', () => {
    console.log('Connected to streaming STT API.');
    // Once connected, set the speech language.
    connection.send(JSON.stringify({
      command: commandType.setLanguage,
      value: speechData.language,
    }));
  });
  connection.on('message', (message) => {
    handleSessionMessage(connection, message);
  });
  connection.on('error', (error) => {
    console.error(error.message);
    connection.close();
  });
  connection.on('close', () => {
    console.log('Connection closed.');
  });
};

main();
For authentication details, please refer to the authentication section.
For the full sample code in each language, please refer here.
©️ 2019 Rozetta API  ・  Powered by Rozetta
