Skip to content

mistral_common.tokens.tokenizers.audio

Audio(audio_array, sampling_rate, format)

Parameters:

Name Type Description Default
audio_array ndarray

The audio data as a numpy array.

required
sampling_rate int

The sampling rate of the audio in Hz.

required
format str

The format of the audio file.

required
Source code in src/mistral_common/tokens/tokenizers/audio.py
def __init__(self, audio_array: np.ndarray, sampling_rate: int, format: str) -> None:
    r"""Build an Audio object from raw samples and their metadata.

    The three arguments are stored on the instance unchanged, after which
    `_check_valid()` is invoked on the populated instance.

    Args:
        audio_array: Numpy array holding the audio data.
        sampling_rate: Sampling rate of the audio, in Hz.
        format: Format of the audio file.
    """
    self.audio_array, self.sampling_rate, self.format = audio_array, sampling_rate, format
    self._check_valid()

duration property

Calculate the duration of the audio in seconds.

Returns:

Type Description
float

The duration of the audio in seconds.

from_audio_chunk(chunk) staticmethod

Create an Audio instance from an AudioChunk.

Parameters:

Name Type Description Default
chunk AudioChunk

An AudioChunk with input_audio as str (base64) or bytes.

required

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/tokens/tokenizers/audio.py
@staticmethod
def from_audio_chunk(chunk: "AudioChunk") -> "Audio":
    r"""Create an Audio instance from an AudioChunk.

    Args:
        chunk: An AudioChunk with input_audio as str (base64) or bytes.

    Returns:
        An instance of the Audio class.
    """
    input_audio = chunk.input_audio
    if isinstance(input_audio, bytes):
        return Audio.from_bytes(input_audio)
    elif isinstance(input_audio, str):
        return Audio.from_base64(input_audio)
    else:
        raise ValueError(f"Unsupported input_audio type: {type(input_audio)}")

from_base64(audio_base64, strict=True) staticmethod

Create an Audio instance from a base64 encoded string.

Parameters:

Name Type Description Default
audio_base64 str

The base64 encoded audio data.

required
strict bool

Whether to strictly enforce mono audio. Defaults to True.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/tokens/tokenizers/audio.py
@staticmethod
def from_base64(audio_base64: str, strict: bool = True) -> "Audio":
    r"""Create an Audio instance from a base64 encoded string.

    The input may be a bare base64 payload or a data URL of the form
    `data:audio/<subtype>;base64,<payload>`. In the latter case the header is
    stripped before decoding.

    Args:
        audio_base64: The base64 encoded audio data, optionally preceded by a data URL header.
        strict: Whether to strictly enforce mono audio. Defaults to True.

    Returns:
        An instance of the Audio class.

    Raises:
        ValueError: If the payload cannot be base64-decoded.
    """
    assert_soundfile_installed()

    # Media subtypes may contain ".", "+" and "-" (e.g. "audio/x-wav"), which `\w` alone
    # does not match. If the header is left in place, the non-validating b64decode below
    # discards its non-alphabet characters and decodes the remainder into garbage bytes.
    if re.match(r"^data:audio/[\w.+-]+;base64,", audio_base64):
        # Only the first comma separates header from payload.
        audio_base64 = audio_base64.split(",", 1)[1]

    try:
        audio_bytes = base64.b64decode(audio_base64)
    except Exception as e:
        raise ValueError("base64 decoding failed. Please check the input string is a valid base64.") from e

    return Audio.from_bytes(audio_bytes, strict=strict)

from_bytes(audio_bytes, strict=True) staticmethod

Create an Audio instance from bytes.

Parameters:

Name Type Description Default
audio_bytes bytes

The audio data as bytes.

required
strict bool

Whether to strictly enforce mono audio. Defaults to True.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/tokens/tokenizers/audio.py
@staticmethod
def from_bytes(audio_bytes: bytes, strict: bool = True) -> "Audio":
    r"""Decode an in-memory audio file into an Audio instance.

    Args:
        audio_bytes: Encoded audio file contents.
        strict: If True, audio that does not decode to a 1-D array is rejected.
            If False, such audio is averaged over axis 1.

    Returns:
        An Audio instance holding the decoded float32 samples, the file's sampling rate
        and its lower-cased format name.

    Raises:
        ValueError: If the decoded array is not 1-D and `strict` is True.
    """
    assert_soundfile_installed()

    with io.BytesIO(audio_bytes) as buffer, sf.SoundFile(buffer) as sound_file:
        audio_array = sound_file.read(dtype="float32")
        sampling_rate = sound_file.samplerate
        container_format = sound_file.format

    # Round-trip through the enum so the reported format is checked against it.
    format_name = AudioFormat(container_format).value.lower()

    if audio_array.ndim != 1:
        if strict:
            raise ValueError(f"{audio_array.ndim=}")
        audio_array = audio_array.mean(axis=1)

    return Audio(audio_array=audio_array, sampling_rate=sampling_rate, format=format_name)

from_file(file, strict=True) staticmethod

Create an Audio instance from an audio file.

Parameters:

Name Type Description Default
file str

Path to the audio file.

required
strict bool

Whether to strictly enforce mono audio. Defaults to True.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/tokens/tokenizers/audio.py
@staticmethod
def from_file(file: str, strict: bool = True) -> "Audio":
    r"""Load an audio file from disk into an Audio instance.

    Args:
        file: Path to the audio file. A leading `file://` on a string path is dropped.
        strict: Whether to strictly enforce mono audio. Defaults to True.

    Returns:
        An instance of the Audio class.

    Raises:
        FileNotFoundError: If the path does not exist.
    """
    assert_soundfile_installed()

    if isinstance(file, str):
        file = file.removeprefix("file://")

    source = Path(file)
    if not source.exists():
        raise FileNotFoundError(f"{file=} does not exist")

    return Audio.from_bytes(source.read_bytes(), strict=strict)

from_raw_audio(audio) staticmethod

Create an Audio instance from a RawAudio object.

Deprecated: Use from_base64() or from_bytes() instead. Will be removed in 1.13.0.

Parameters:

Name Type Description Default
audio RawAudio

The RawAudio object containing audio data.

required

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/tokens/tokenizers/audio.py
@staticmethod
def from_raw_audio(audio: "RawAudio") -> "Audio":
    r"""Build an Audio instance from a RawAudio object.

    Deprecated: Use `from_base64()` or `from_bytes()` instead. Will be removed in 1.13.0.

    Args:
        audio: RawAudio object whose `data` is either raw bytes or a base64 string.

    Returns:
        An instance of the Audio class.

    Raises:
        ValueError: If `audio.data` is neither `bytes` nor `str`.
    """
    deprecation_message = (
        "Audio.from_raw_audio() is deprecated. "
        "Use Audio.from_base64() or Audio.from_bytes() instead. "
        "Will be removed in 1.13.0."
    )
    warn_once("Audio.from_raw_audio", deprecation_message, DeprecationWarning, stacklevel=2)

    payload = audio.data
    if isinstance(payload, bytes):
        return Audio.from_bytes(payload)
    if isinstance(payload, str):
        return Audio.from_base64(payload)

    raise ValueError(f"Unsupported audio data type: {type(payload)}")

from_url(url, strict=True) staticmethod

Create an Audio instance from a URL.

Parameters:

Name Type Description Default
url str

The URL of the audio file.

required
strict bool

Whether to strictly enforce mono audio.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/tokens/tokenizers/audio.py
@staticmethod
def from_url(url: str, strict: bool = True) -> "Audio":
    r"""Download an audio file and turn it into an Audio instance.

    Args:
        url: The URL of the audio file.
        strict: Whether to strictly enforce mono audio.

    Returns:
        An instance of the Audio class.

    Raises:
        ValueError: If the download fails, or if the downloaded content cannot be
            turned into an Audio instance.
    """
    try:
        reply = _requests_lib.get(url)
        reply.raise_for_status()
        downloaded = reply.content
        return Audio.from_bytes(downloaded, strict=strict)
    except _requests_lib.RequestException as request_error:
        # The HTTP request itself failed (including a non-success status).
        raise ValueError(f"Failed to download audio from URL: {url}") from request_error
    except Exception as decode_error:
        # The request succeeded but the payload could not be decoded.
        raise ValueError(f"Failed to create Audio instance from URL: {url} .") from decode_error

resample(new_sampling_rate)

Resample audio data to a new sampling rate.

Parameters:

Name Type Description Default
new_sampling_rate int

The new sampling rate to resample the audio to.

required
Source code in src/mistral_common/tokens/tokenizers/audio.py
def resample(self, new_sampling_rate: int) -> None:
    r"""Resample the stored audio in place to a new sampling rate.

    Nothing happens when the audio is already at `new_sampling_rate`. Otherwise both
    `audio_array` and `sampling_rate` are overwritten on the instance.

    Args:
        new_sampling_rate: The new sampling rate to resample the audio to.
    """
    if self.sampling_rate != new_sampling_rate:
        assert_soxr_installed()

        resampled = soxr.resample(self.audio_array, self.sampling_rate, new_sampling_rate, quality="HQ")
        self.audio_array = resampled
        self.sampling_rate = new_sampling_rate

to_base64(format, prefix=False)

Convert the audio data to a base64 encoded string.

Parameters:

Name Type Description Default
format str

The format to encode the audio in.

required
prefix bool

Whether to add a data prefix to the base64 encoded string.

False

Returns:

Type Description
str

The base64 encoded audio data.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def to_base64(self, format: str, prefix: bool = False) -> str:
    r"""Serialize the audio to a base64 string in the requested format.

    Args:
        format: The format to encode the audio in; must be one of `EXPECTED_FORMAT_VALUES`.
        prefix: When True, prepend a `data:audio/<format>;base64,` header to the result.

    Returns:
        The base64 encoded audio data, with the data header if `prefix` is set.
    """
    assert_soundfile_installed()

    assert format in EXPECTED_FORMAT_VALUES, f"{format=} not in {EXPECTED_FORMAT_VALUES=}"

    with io.BytesIO() as buffer:
        sf.write(buffer, self.audio_array, self.sampling_rate, format=format.upper())
        # getvalue() returns the whole buffer regardless of the current position.
        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")

    return f"data:audio/{format.lower()};base64,{encoded}" if prefix else encoded

AudioConfig(sampling_rate, frame_rate, encoding_config, chunk_length_s=None, transcription_format=TranscriptionFormat.INSTRUCT, transcription_delay_ms=None, streaming_look_ahead_ms=None, streaming_look_back_ms=None, streaming_n_left_pad_tokens=None, voice_num_audio_tokens=None) dataclass

Configuration for audio processing.

Attributes:

Name Type Description
sampling_rate int

Sampling rate of the audio.

frame_rate float

Number of frames per second accepted by the tokenizer model.

encoding_config AudioSpectrogramConfig

Configuration for audio spectrogram.

chunk_length_s float | None

Chunk length in seconds; when set, an audio is padded to a multiple of chunk_length_s seconds (optional).

voice_num_audio_tokens dict[str, int] | None

Mapping from speaker voice name to number of audio tokens for that speaker's reference audio (optional, only for TTS).

audio_length_per_tok property

Calculate the length of audio per token.

chunk_frames property

Calculate the number of frames per chunk.

AudioEncoder(audio_config, special_ids)

Encodes audio chunks into a format suitable for further processing.

Attributes:

Name Type Description
audio_config

Configuration for audio processing.

encoding_config

Configuration for audio spectrogram.

special_ids

Special tokens for audio encoding.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def __init__(self, audio_config: AudioConfig, special_ids: SpecialAudioIDs) -> None:
    r"""Set up the encoder with its audio configuration and special token ids.

    Args:
        audio_config: Configuration for audio processing. Its `encoding_config` is
            also stored directly on the encoder as `encoding_config`.
        special_ids: Special tokens for audio encoding.
    """
    self.special_ids = special_ids
    self.audio_config = audio_config
    self.encoding_config = audio_config.encoding_config

audio_to_text_token property

Get the audio_to_text token.

audio_token property

Get the audio token.

begin_audio_token property

Get the begin audio token.

streaming_pad property

Get the streaming pad token.

text_to_audio_token property

Get the text_to_audio token.

__call__(content)

Call the encoder on an audio chunk or URL chunk.

Parameters:

Name Type Description Default
content AudioChunk | AudioURLChunk

Audio or URL chunk to encode.

required

Returns:

Type Description
AudioEncoding

Encoded audio data and tokens.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def __call__(self, content: AudioChunk | AudioURLChunk) -> AudioEncoding:
    r"""Encode an audio chunk or an audio URL chunk.

    Args:
        content: Audio or URL chunk to encode.

    Returns:
        Encoded audio data and tokens.

    Raises:
        ValueError: If `content` is neither an AudioURLChunk nor an AudioChunk.
    """
    # URL chunks are checked first, then plain audio chunks.
    if isinstance(content, AudioURLChunk):
        return self._encode_audio_url_chunk(content)
    if isinstance(content, AudioChunk):
        return self._encode_audio_chunk(content)

    raise ValueError(f"Unsupported content type: {type(content)}")

encode_audio(audio, transcription_delay_ms=None)

Encode an audio, optionally with a transcription delay.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def encode_audio(self, audio: Audio, transcription_delay_ms: float | None = None) -> AudioEncoding:
    r"""Encode an audio, optionally with a transcription delay.

    The given `audio` is modified in place: it is resampled to the configured sampling
    rate and its `audio_array` is replaced by the padded array.

    Args:
        audio: The audio to encode.
        transcription_delay_ms: Delay in milliseconds, forwarded to padding and to
            streaming token encoding.

    Returns:
        The token sequence together with the (resampled and padded) audio.
    """
    config = self.audio_config

    audio.resample(config.sampling_rate)
    audio.audio_array = self.pad(audio.audio_array, config.sampling_rate, transcription_delay_ms)

    uses_streaming_format = config.transcription_format == TranscriptionFormat.STREAMING
    tokens = (
        self.encode_streaming_tokens(transcription_delay_ms)
        if uses_streaming_format
        else self._encode_audio_tokens(audio.audio_array.shape[0])
    )

    return AudioEncoding(tokens=tokens, audio=audio)

encode_audio_for_speech_request(audio, voice)

Encode audio or voice preset into an AudioEncoding for speech synthesis.

Either audio (reference audio for voice cloning) or voice (preset name) must be provided. When audio is given it takes precedence.

Parameters:

Name Type Description Default
audio Audio | None

Reference audio waveform, or None to use a voice preset.

required
voice str | None

Preset voice name (e.g. 'Neutral Male', 'Neutral Female'), or None when using ref audio.

required

Returns:

Type Description
AudioEncoding

AudioEncoding containing the token sequence and optional audio data.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def encode_audio_for_speech_request(self, audio: Audio | None, voice: str | None) -> AudioEncoding:
    r"""Encode reference audio or a voice preset for a speech synthesis request.

    At least one of `audio` (reference audio for voice cloning) and `voice`
    (preset name) has to be given. If `audio` is present it wins over `voice`.

    Args:
        audio: Reference audio waveform, or None to use a voice preset. When given,
            it is resampled in place to the configured sampling rate.
        voice: Preset voice name (e.g. 'Neutral Male', 'Neutral Female'), or None when using ref audio.

    Returns:
        AudioEncoding containing the token sequence and optional audio data.
    """
    assert audio is not None or voice is not None, (
        f"Either audio or voice must be defined to encode audio, got {audio=} and {voice=}"
    )

    if audio is None:
        # Preset voice: the token count comes from the configured lookup table.
        voice_table = self.audio_config.voice_num_audio_tokens
        assert voice_table is not None, (
            "voice_num_audio_tokens must be set in audio config to use voice-based speech requests"
        )
        assert voice is not None and voice in voice_table, (
            f"Unknown voice {voice!r}, expected one of {list(voice_table)}"
        )
        num_audio_tokens = voice_table[voice]
    else:
        # Reference audio: the token count is derived from the resampled length.
        audio.resample(self.audio_config.sampling_rate)
        num_audio_tokens = self._get_num_audio_token_for_speech_request(len(audio.audio_array))

    return AudioEncoding(
        tokens=self._encode_audio_tokens_for_speech_request(num_audio_tokens),
        audio=audio,
    )

encode_streaming_tokens(transcription_delay_ms=None)

Encode the streaming tokens given a transcription delay.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def encode_streaming_tokens(self, transcription_delay_ms: float | None = None) -> list[int]:
    r"""Build the streaming pad token prefix for a given transcription delay.

    Args:
        transcription_delay_ms: Delay in milliseconds, passed to the audio config to
            obtain the number of delay tokens.

    Returns:
        A list made of the streaming pad token repeated once per left-pad token and
        once per delay token.
    """
    assert isinstance(self.audio_config.encoding_config, AudioSpectrogramConfig), (
        f"Audio encoder must be spectrogram encoder, got {self.audio_config.encoding_config=}"
    )
    assert self.audio_config.transcription_delay_ms is not None

    # The prefix covers the silence padded on the left plus the delay tokens.
    n_left_pad = self.audio_config.n_left_pad_tokens
    n_delay = self.audio_config.get_num_delay_tokens(transcription_delay_ms)

    return [self.streaming_pad] * (n_left_pad + n_delay)

get_padding_audio(transcription_delay_ms=None)

Gets left and right padding for realtime audio models.

Parameters:

Name Type Description Default
transcription_delay_ms optional

Delay in milliseconds for transcription.

None

Returns:

Type Description
tuple[Audio, Audio]

Tuple of left and right padding for realtime audio models.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def get_padding_audio(self, transcription_delay_ms: float | None = None) -> tuple[Audio, Audio]:
    r"""Return the left and right silence padding for realtime audio models.

    Args:
        transcription_delay_ms (optional): Delay in milliseconds for transcription.

    Returns:
        Tuple of left and right padding, each an all-zero float32 "wav" Audio at the
        configured sampling rate.
    """

    def _silence(num_samples: int) -> Audio:
        # All-zero waveform of the requested length.
        return Audio(
            audio_array=np.zeros(num_samples, dtype=np.float32),
            sampling_rate=self.audio_config.sampling_rate,
            format="wav",
        )

    # Pad sizes are requested for an audio of length 0.
    left_len, right_len = self._get_streaming_pad(0, transcription_delay_ms)
    left_silence = _silence(left_len)
    right_silence = _silence(right_len)
    return left_silence, right_silence

next_multiple_of_chunk_frames(audio_array_len, sampling_rate)

Calculate the next multiple of chunk frames.

Parameters:

Name Type Description Default
audio_array_len int

Length of the audio array.

required
sampling_rate int

Sampling rate of the audio.

required

Returns:

Type Description
int

The next multiple of chunk frames.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def next_multiple_of_chunk_frames(self, audio_array_len: int, sampling_rate: int) -> int:
    r"""Round an audio length up to a whole number of chunks.

    Args:
        audio_array_len: Length of the audio array.
        sampling_rate: Sampling rate of the audio; must equal the configured sampling rate.

    Returns:
        The smallest multiple of the configured chunk frames that is not below
        `audio_array_len`.
    """
    assert sampling_rate == self.audio_config.sampling_rate, (
        f"Expected {sampling_rate=} to be {self.audio_config.sampling_rate=}"
    )
    assert self.audio_config.chunk_length_s is not None, (
        f"Can't call next_multiple_of_chunk_frames if {self.audio_config.chunk_length_s=}."
    )

    frames_per_chunk = self.audio_config.chunk_frames
    num_chunks = math.ceil(audio_array_len / frames_per_chunk)
    return num_chunks * frames_per_chunk

pad(audio_array, sampling_rate, transcription_delay_ms=None, **kwargs)

Pad the audio array to the desired length.

Parameters:

Name Type Description Default
audio_array ndarray

Audio data as a numpy array.

required
sampling_rate int

Sampling rate of the audio.

required
transcription_delay_ms optional

Delay in milliseconds for transcription.

None

Returns:

Type Description
ndarray

Padded audio array.

Source code in src/mistral_common/tokens/tokenizers/audio.py
def pad(
    self,
    audio_array: np.ndarray,
    sampling_rate: int,
    transcription_delay_ms: float | None = None,
    **kwargs: Any,
) -> np.ndarray:
    r"""Pad the audio array to the desired length.

    Args:
        audio_array: Audio data as a numpy array.
        sampling_rate: Sampling rate of the audio.
        transcription_delay_ms (optional): Delay in milliseconds for transcription.

    Returns:
        Padded audio array.
    """
    # TODO(Patrick) - remove **kwargs as it's just there to swallow deprecated
    # keyword args from voxtral_realtime in vLLM. It was
    # relevant for the release. Remove in mistral_common version 1.13.0
    if self.audio_config.chunk_length_s:
        next_multiple_of_chunk_frames = self.next_multiple_of_chunk_frames(audio_array.shape[-1], sampling_rate)
        audio_array = np.pad(audio_array, (0, next_multiple_of_chunk_frames - audio_array.shape[-1]))
    elif self.audio_config.is_streaming:
        left_pad, right_pad = self._get_streaming_pad(audio_array.shape[-1], transcription_delay_ms)
        # we pad both left & right as this leads to better performance
        audio_array = np.pad(audio_array, (left_pad, right_pad))
    elif (
        isinstance(self.encoding_config, AudioSpectrogramConfig)
        and audio_array.shape[-1] < self.encoding_config.window_size
    ):
        # minimum length for audios is at least one spectrogram frame
        audio_array = np.pad(audio_array, (0, self.encoding_config.window_size - audio_array.shape[-1]))

    return audio_array

AudioEncoding(tokens, audio) dataclass

Encapsulates the tokens and audio data for an audio chunk.

Attributes:

Name Type Description
tokens list[int]

Text tokens corresponding to this audio chunk.

audio Audio | None

Original audio waveform data, or None when using a preset voice (no reference audio to forward to the model).

AudioFormat

Bases: Enum

Dynamic enum whose members depend on soundfile availability at runtime.

AudioSpectrogramConfig(num_mel_bins, hop_length, window_size) dataclass

Configuration for generating an audio spectrogram.

Attributes:

Name Type Description
num_mel_bins int

Number of mel bins, typically 80 or 128.

hop_length int

Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients, typically 160.

window_size int

Window size of the Fourier transform, typically 400.

SpecialAudioIDs(audio, begin_audio, streaming_pad, text_to_audio, audio_to_text) dataclass

Special text tokens corresponding to audio token sequence.

Attributes:

Name Type Description
audio int | None

Token representing audio.

begin_audio int | None

Token representing the beginning of audio.

streaming_pad int | None

Token representing streaming pad of audio. Only relevant for streaming models.

text_to_audio int | None

Token representing intent to convert text to audio.

audio_to_text int | None

Token representing intent to convert audio to text.

TranscriptionFormat

Bases: str, Enum

Transcription format.

Should be set by the tokenizer for correct encoding.

Attributes: - INSTRUCT: The instruct format. - STREAMING: The streaming format.