ElevenLabs Speech-to-Text V2

# POST a transcription request to the ElevenLabs Speech-to-Text V2 endpoint.
# The <...> strings and the 123 numbers are sample placeholders; replace them
# with real values before running.
curl -X POST \
  --url https://api.highwayapi.ai/v3/elevenlabs-scribe-v2 \
  -H 'Authorization: <authorization>' \
  -H 'Content-Type: <content-type>' \
  -d '
{
  "seed": 123,
  "diarize": true,
  "file_format": "<string>",
  "temperature": 123,
  "num_speakers": 123,
  "language_code": "<string>",
  "tag_audio_events": true,
  "cloud_storage_url": "<string>",
  "use_multi_channel": true,
  "diarization_threshold": 123,
  "timestamps_granularity": "<string>"
}
'

import requests

# Endpoint for the ElevenLabs Speech-to-Text V2 model.
url = "https://api.highwayapi.ai/v3/elevenlabs-scribe-v2"

# Placeholder header values; substitute real ones before running.
headers = {
    "Content-Type": "<content-type>",
    "Authorization": "<authorization>"
}

# Placeholder request body; the "<string>" and 123 entries are sample values.
payload = {
    "seed": 123,
    "diarize": True,
    "file_format": "<string>",
    "temperature": 123,
    "num_speakers": 123,
    "language_code": "<string>",
    "tag_audio_events": True,
    "cloud_storage_url": "<string>",
    "use_multi_channel": True,
    "diarization_threshold": 123,
    "timestamps_granularity": "<string>"
}

# Send the payload as JSON and print the raw response text.
response = requests.post(url, headers=headers, json=payload)
print(response.text)

// Placeholder request body; replace each sample value before sending.
const payload = {
  seed: 123,
  diarize: true,
  file_format: '<string>',
  temperature: 123,
  num_speakers: 123,
  language_code: '<string>',
  tag_audio_events: true,
  cloud_storage_url: '<string>',
  use_multi_channel: true,
  diarization_threshold: 123,
  timestamps_granularity: '<string>'
};

// Request settings; the payload is serialized to JSON for the POST body.
const options = {
  method: 'POST',
  headers: {'Content-Type': '<content-type>', Authorization: '<authorization>'},
  body: JSON.stringify(payload)
};

// Send the request, then log the parsed JSON reply or any error raised on the way.
(async () => {
  try {
    const res = await fetch('https://api.highwayapi.ai/v3/elevenlabs-scribe-v2', options);
    console.log(await res.json());
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Placeholder request body, JSON-encoded for the POST payload.
$payload = json_encode([
  'seed' => 123,
  'diarize' => true,
  'file_format' => '<string>',
  'temperature' => 123,
  'num_speakers' => 123,
  'language_code' => '<string>',
  'tag_audio_events' => true,
  'cloud_storage_url' => '<string>',
  'use_multi_channel' => true,
  'diarization_threshold' => 123,
  'timestamps_granularity' => '<string>'
]);

// Placeholder header values; substitute real ones before running.
$headers = [
  "Authorization: <authorization>",
  "Content-Type: <content-type>"
];

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.highwayapi.ai/v3/elevenlabs-scribe-v2",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $payload,
  CURLOPT_HTTPHEADER => $headers,
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

// Print the cURL error message when there is one; otherwise print the response.
echo $err ? "cURL Error #:" . $err : $response;

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a placeholder transcription request to the ElevenLabs
// Speech-to-Text V2 endpoint and prints the raw response body.
// Any failure is printed and the program returns without touching
// values that are invalid after the error.
func main() {

	url := "https://api.highwayapi.ai/v3/elevenlabs-scribe-v2"

	// Placeholder JSON body; replace the sample values before running.
	payload := strings.NewReader("{\n  \"seed\": 123,\n  \"diarize\": true,\n  \"file_format\": \"<string>\",\n  \"temperature\": 123,\n  \"num_speakers\": 123,\n  \"language_code\": \"<string>\",\n  \"tag_audio_events\": true,\n  \"cloud_storage_url\": \"<string>\",\n  \"use_multi_channel\": true,\n  \"diarization_threshold\": 123,\n  \"timestamps_granularity\": \"<string>\"\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	// Placeholder header values; substitute real ones before running.
	req.Header.Add("Content-Type", "<content-type>")
	req.Header.Add("Authorization", "<authorization>")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so return before res.Body is used.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response:", err)
		return
	}

	fmt.Println(string(body))

}

// Placeholder JSON request body; replace the sample values before sending.
String payload = "{\n  \"seed\": 123,\n  \"diarize\": true,\n  \"file_format\": \"<string>\",\n  \"temperature\": 123,\n  \"num_speakers\": 123,\n  \"language_code\": \"<string>\",\n  \"tag_audio_events\": true,\n  \"cloud_storage_url\": \"<string>\",\n  \"use_multi_channel\": true,\n  \"diarization_threshold\": 123,\n  \"timestamps_granularity\": \"<string>\"\n}";

// POST the body with placeholder headers and read the reply as a string.
HttpResponse<String> response = Unirest.post("https://api.highwayapi.ai/v3/elevenlabs-scribe-v2")
  .header("Content-Type", "<content-type>")
  .header("Authorization", "<authorization>")
  .body(payload)
  .asString();

require 'uri'
require 'net/http'

# Endpoint for the ElevenLabs Speech-to-Text V2 model.
endpoint = URI("https://api.highwayapi.ai/v3/elevenlabs-scribe-v2")

# Build the POST request; header values and body fields are placeholders.
req = Net::HTTP::Post.new(endpoint)
req["Content-Type"] = '<content-type>'
req["Authorization"] = '<authorization>'
req.body = "{\n  \"seed\": 123,\n  \"diarize\": true,\n  \"file_format\": \"<string>\",\n  \"temperature\": 123,\n  \"num_speakers\": 123,\n  \"language_code\": \"<string>\",\n  \"tag_audio_events\": true,\n  \"cloud_storage_url\": \"<string>\",\n  \"use_multi_channel\": true,\n  \"diarization_threshold\": 123,\n  \"timestamps_granularity\": \"<string>\"\n}"

# Configure an HTTPS client for the endpoint host, then send the request.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

reply = client.request(req)
puts reply.read_body

POST

elevenlabs-scribe-v2

ElevenLabs Speech-to-Text V2

# POST a transcription request to the ElevenLabs Speech-to-Text V2 endpoint.
# The <...> strings and the 123 numbers are sample placeholders; replace them
# with real values before running.
curl -X POST \
  --url https://api.highwayapi.ai/v3/elevenlabs-scribe-v2 \
  -H 'Authorization: <authorization>' \
  -H 'Content-Type: <content-type>' \
  -d '
{
  "seed": 123,
  "diarize": true,
  "file_format": "<string>",
  "temperature": 123,
  "num_speakers": 123,
  "language_code": "<string>",
  "tag_audio_events": true,
  "cloud_storage_url": "<string>",
  "use_multi_channel": true,
  "diarization_threshold": 123,
  "timestamps_granularity": "<string>"
}
'

import requests

# Endpoint for the ElevenLabs Speech-to-Text V2 model.
url = "https://api.highwayapi.ai/v3/elevenlabs-scribe-v2"

# Placeholder header values; substitute real ones before running.
headers = {
    "Content-Type": "<content-type>",
    "Authorization": "<authorization>"
}

# Placeholder request body; the "<string>" and 123 entries are sample values.
payload = {
    "seed": 123,
    "diarize": True,
    "file_format": "<string>",
    "temperature": 123,
    "num_speakers": 123,
    "language_code": "<string>",
    "tag_audio_events": True,
    "cloud_storage_url": "<string>",
    "use_multi_channel": True,
    "diarization_threshold": 123,
    "timestamps_granularity": "<string>"
}

# Send the payload as JSON and print the raw response text.
response = requests.post(url, headers=headers, json=payload)
print(response.text)

// Placeholder request body; replace each sample value before sending.
const payload = {
  seed: 123,
  diarize: true,
  file_format: '<string>',
  temperature: 123,
  num_speakers: 123,
  language_code: '<string>',
  tag_audio_events: true,
  cloud_storage_url: '<string>',
  use_multi_channel: true,
  diarization_threshold: 123,
  timestamps_granularity: '<string>'
};

// Request settings; the payload is serialized to JSON for the POST body.
const options = {
  method: 'POST',
  headers: {'Content-Type': '<content-type>', Authorization: '<authorization>'},
  body: JSON.stringify(payload)
};

// Send the request, then log the parsed JSON reply or any error raised on the way.
(async () => {
  try {
    const res = await fetch('https://api.highwayapi.ai/v3/elevenlabs-scribe-v2', options);
    console.log(await res.json());
  } catch (err) {
    console.error(err);
  }
})();

<?php

// Placeholder request body, JSON-encoded for the POST payload.
$payload = json_encode([
  'seed' => 123,
  'diarize' => true,
  'file_format' => '<string>',
  'temperature' => 123,
  'num_speakers' => 123,
  'language_code' => '<string>',
  'tag_audio_events' => true,
  'cloud_storage_url' => '<string>',
  'use_multi_channel' => true,
  'diarization_threshold' => 123,
  'timestamps_granularity' => '<string>'
]);

// Placeholder header values; substitute real ones before running.
$headers = [
  "Authorization: <authorization>",
  "Content-Type: <content-type>"
];

$curl = curl_init();

curl_setopt_array($curl, [
  CURLOPT_URL => "https://api.highwayapi.ai/v3/elevenlabs-scribe-v2",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_ENCODING => "",
  CURLOPT_MAXREDIRS => 10,
  CURLOPT_TIMEOUT => 30,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $payload,
  CURLOPT_HTTPHEADER => $headers,
]);

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

// Print the cURL error message when there is one; otherwise print the response.
echo $err ? "cURL Error #:" . $err : $response;

package main

import (
	"fmt"
	"strings"
	"net/http"
	"io"
)

// main posts a placeholder transcription request to the ElevenLabs
// Speech-to-Text V2 endpoint and prints the raw response body.
// Any failure is printed and the program returns without touching
// values that are invalid after the error.
func main() {

	url := "https://api.highwayapi.ai/v3/elevenlabs-scribe-v2"

	// Placeholder JSON body; replace the sample values before running.
	payload := strings.NewReader("{\n  \"seed\": 123,\n  \"diarize\": true,\n  \"file_format\": \"<string>\",\n  \"temperature\": 123,\n  \"num_speakers\": 123,\n  \"language_code\": \"<string>\",\n  \"tag_audio_events\": true,\n  \"cloud_storage_url\": \"<string>\",\n  \"use_multi_channel\": true,\n  \"diarization_threshold\": 123,\n  \"timestamps_granularity\": \"<string>\"\n}")

	req, err := http.NewRequest(http.MethodPost, url, payload)
	if err != nil {
		fmt.Println("building request:", err)
		return
	}

	// Placeholder header values; substitute real ones before running.
	req.Header.Add("Content-Type", "<content-type>")
	req.Header.Add("Authorization", "<authorization>")

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		// res is nil when Do fails, so return before res.Body is used.
		fmt.Println("sending request:", err)
		return
	}
	defer res.Body.Close()

	body, err := io.ReadAll(res.Body)
	if err != nil {
		fmt.Println("reading response:", err)
		return
	}

	fmt.Println(string(body))

}

// Placeholder JSON request body; replace the sample values before sending.
String payload = "{\n  \"seed\": 123,\n  \"diarize\": true,\n  \"file_format\": \"<string>\",\n  \"temperature\": 123,\n  \"num_speakers\": 123,\n  \"language_code\": \"<string>\",\n  \"tag_audio_events\": true,\n  \"cloud_storage_url\": \"<string>\",\n  \"use_multi_channel\": true,\n  \"diarization_threshold\": 123,\n  \"timestamps_granularity\": \"<string>\"\n}";

// POST the body with placeholder headers and read the reply as a string.
HttpResponse<String> response = Unirest.post("https://api.highwayapi.ai/v3/elevenlabs-scribe-v2")
  .header("Content-Type", "<content-type>")
  .header("Authorization", "<authorization>")
  .body(payload)
  .asString();

require 'uri'
require 'net/http'

# Endpoint for the ElevenLabs Speech-to-Text V2 model.
endpoint = URI("https://api.highwayapi.ai/v3/elevenlabs-scribe-v2")

# Build the POST request; header values and body fields are placeholders.
req = Net::HTTP::Post.new(endpoint)
req["Content-Type"] = '<content-type>'
req["Authorization"] = '<authorization>'
req.body = "{\n  \"seed\": 123,\n  \"diarize\": true,\n  \"file_format\": \"<string>\",\n  \"temperature\": 123,\n  \"num_speakers\": 123,\n  \"language_code\": \"<string>\",\n  \"tag_audio_events\": true,\n  \"cloud_storage_url\": \"<string>\",\n  \"use_multi_channel\": true,\n  \"diarization_threshold\": 123,\n  \"timestamps_granularity\": \"<string>\"\n}"

# Configure an HTTPS client for the endpoint host, then send the request.
client = Net::HTTP.new(endpoint.host, endpoint.port)
client.use_ssl = true

reply = client.request(req)
puts reply.read_body

Transcribe audio or video files. When use_multi_channel is true and the uploaded audio has multiple channels, a ‘transcripts’ object is returned, with one transcription per channel. Otherwise, a single transcription result is returned.

Request Headers

string

required

Enum value: application/json

string

required

Bearer authentication format: Bearer {{API Key}}.

Request Body

integer

If specified, the system will do its best to sample deterministically. Requests with the same seed and parameters should return the same result, but absolute determinism is not guaranteed. Must be an integer between 0 and 2147483647. Range: [0, 2147483647]

boolean

default:false

Whether to label the current speaker in the uploaded file.

string

default:"other"

Input audio format. Options are ‘pcm_s16le_16’ or ‘other’. pcm_s16le_16 requires the audio to be 16 kHz sample rate, 16-bit integer, mono, little-endian format, and has lower latency than encoded waveforms. Allowed values: pcm_s16le_16, other

number

Controls the randomness of the transcription output. The range is 0.0 to 2.0; higher values produce more diverse and less deterministic results. If omitted, the default temperature of the selected model will be used (usually 0). Range: [0, 2]

integer

The maximum number of speakers in the uploaded file. This can be used to help distinguish speakers, with support for up to 32 speakers. Range: [1, 32]

string

Specify the ISO-639-1 or ISO-639-3 language code of the audio file. Providing it in advance can sometimes improve transcription performance. The default is null, which automatically detects the language.

boolean

default:true

Whether to mark audio events such as (laughter) and (footsteps) in the transcription.

string

required

HTTPS link to the file to be transcribed. Exactly one of file and cloud_storage_url must be provided. The file must be accessible via HTTPS and smaller than 2 GB. Any valid HTTPS address is supported, including cloud storage (AWS S3, GCS, Cloudflare R2, etc.), CDNs, or other HTTPS sources. Presigned links with tokens or URL query parameter authentication are supported.

boolean

default:false

Whether the audio file is multi-channel and each channel contains only a single speaker. When enabled, each channel will be transcribed independently and the results will be merged. Each word in the output contains a channel_index field. Up to 5 channels are supported.

number

Diarization threshold. With a higher value, one person is less likely to be split into multiple speakers, but different people are more likely to be merged into one speaker (fewer identified speakers). With a lower value, one person is more likely to be split into multiple speakers, but different people are less likely to be merged into one speaker (more speakers). Can only be set when diarize=True and num_speakers=None. The default is None, which selects a threshold based on the model id (usually 0.22). Range: [0.1, 0.4]

string

default:"word"

The granularity of timestamps in the transcription. ‘word’ provides word-level timestamps, while ‘character’ provides timestamps for each character. Allowed values: none, word, character

Response Information

The response may be one of the following response types:

Response Type 1

string

required

The raw transcribed text.

object[]

required

A list of words and their timing information.

Hide properties

number

The end time of the word or sound in the audio, in seconds.

string

required

The transcribed word or sound content.

string

required

The type of this word or sound. ‘audio_event’ is used for non-word sounds, such as laughter or footsteps. Allowed values: word, spacing, audio_event

number

The start time of the word or sound in the audio, in seconds.

number

required

The log probability when predicting this word. The logprob range is [-infinity, 0]; a higher value indicates the model is more confident in its prediction.

object[]

The characters that make up the word and their corresponding timing information.

Hide properties

number

The end time of the character in the audio, in seconds.

string

required

The transcribed character content.

number

The start time of the character in the audio, in seconds.

string

The unique identifier of the speaker corresponding to this word.

integer

The channel index corresponding to this transcription (valid for multi-channel audio).

string

required

The detected language code (for example, ‘eng’ for English).

string

The unique transcription ID for this response.

number

required

The confidence of language detection (between 0 and 1).

Response Type 2

object[]

required

A list of transcriptions corresponding to each audio channel. Each transcription contains the text for its channel and word-level details.

Hide properties

string

required

The raw transcribed text.

object[]

required

A list of words and their timing information.

Hide properties

number

The end time of the word or sound in the audio, in seconds.

string

required

The transcribed word or sound content.

string

required

The type of this word or sound. ‘audio_event’ is used for non-word sounds, such as laughter or footsteps. Allowed values: word, spacing, audio_event

number

The start time of the word or sound in the audio, in seconds.

number

required

The log probability when predicting this word. The logprob range is [-infinity, 0]; a higher value indicates the model is more confident in its prediction.

object[]

The characters that make up the word and their corresponding timing information.

Hide properties

number

The end time of the character in the audio, in seconds.

string

required

The transcribed character content.

number

The start time of the character in the audio, in seconds.

string

The unique identifier of the speaker corresponding to this word.

integer

The channel index corresponding to this transcription (valid for multi-channel audio).

string

required

The detected language code (for example, ‘eng’ for English).

string

The unique transcription ID for this response.

number

required

The confidence of language detection (between 0 and 1).

string

The unique transcription ID for this response.

ElevenLabs Speech to Text V1

ElevenLabs Text-to-Speech Flash V2

API Basics

Large Language Models

Images

Video

Audio

ElevenLabs Speech-to-Text V2

Request Headers

Request Body

Response Information

Request Headers

Request Body

Response Information

Request Headers

Request Body

Response Information