Skip to content

mistral_common.audio

Audio(audio_array, sampling_rate, format)

Parameters:

Name Type Description Default
audio_array ndarray

The audio data as a numpy array.

required
sampling_rate int

The sampling rate of the audio in Hz.

required
format str

The format of the audio file.

required
Source code in src/mistral_common/audio.py
def __init__(self, audio_array: np.ndarray, sampling_rate: int, format: str) -> None:
    r"""Build an Audio object from samples and their metadata.

    The three arguments are stored on the instance under the same names and
    the instance is then passed through `_check_valid`.

    Args:
        audio_array: The audio data as a numpy array.
        sampling_rate: The sampling rate of the audio in Hz.
        format: The format of the audio file.
    """
    self.audio_array, self.sampling_rate, self.format = audio_array, sampling_rate, format
    # Called only once all three attributes have been assigned.
    self._check_valid()

duration property

Calculate the duration of the audio in seconds.

Returns:

Type Description
float

The duration of the audio in seconds.

from_base64(audio_base64, strict=True) staticmethod

Create an Audio instance from a base64 encoded string.

Parameters:

Name Type Description Default
audio_base64 str

The base64 encoded audio data.

required
strict bool

Whether to strictly enforce mono audio. Defaults to True.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/audio.py
@staticmethod
def from_base64(audio_base64: str, strict: bool = True) -> "Audio":
    r"""Create an Audio instance from a base64 encoded string.

    A leading data-URI header of the form ``data:audio/<subtype>;base64,`` is
    removed before decoding, so both bare base64 payloads and data URIs are
    accepted.

    Args:
        audio_base64: The base64 encoded audio data.
        strict: Whether to strictly enforce mono audio. Defaults to True.

    Returns:
        An instance of the Audio class.

    Raises:
        ImportError: If soundfile is not installed.
        ValueError: If the payload cannot be base64-decoded.
    """
    if not is_soundfile_installed():
        raise ImportError(
            "soundfile is required for this function. Install it with 'pip install mistral-common[soundfile]'"
        )

    # Media subtypes may contain '.', '+' and '-' (e.g. "x-wav"), which `\w+` alone does not match.
    # An unmatched header would be left in place and b64decode would silently drop its
    # non-alphabet characters, yielding corrupt bytes instead of an error.
    if re.match(r"^data:audio/[\w.+-]+;base64,", audio_base64):
        # Split once: everything after the first comma is the payload.
        audio_base64 = audio_base64.split(",", 1)[1]

    try:
        audio_bytes = base64.b64decode(audio_base64)
    except ValueError as e:  # binascii.Error (bad padding etc.) is a ValueError subclass
        raise ValueError("base64 decoding failed. Please check the input string is a valid base64.") from e

    return Audio.from_bytes(audio_bytes, strict=strict)

from_bytes(audio_bytes, strict=True) staticmethod

Create an Audio instance from bytes.

Parameters:

Name Type Description Default
audio_bytes bytes

The audio data as bytes.

required
strict bool

Whether to strictly enforce mono audio. Defaults to True.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/audio.py
@staticmethod
def from_bytes(audio_bytes: bytes, strict: bool = True) -> "Audio":
    r"""Create an Audio instance from bytes.

    The bytes are opened as an in-memory sound file, fully read as float32
    samples, and the container format reported by soundfile is mapped through
    `AudioFormat` to its lower-cased value.

    Args:
        audio_bytes: The audio data as bytes.
        strict: Whether to strictly enforce mono audio. Defaults to True.
            When False, audio that is not one-dimensional is averaged over
            axis 1 to produce a single channel.

    Returns:
        An instance of the Audio class.

    Raises:
        ImportError: If soundfile is not installed.
        ValueError: If the decoded audio is not one-dimensional and `strict`
            is True, or if the file format is not a member of `AudioFormat`.
    """
    # Same guard as from_base64 / from_file: this method is also reached directly
    # (from_url, from_raw_audio), where a missing soundfile would otherwise surface
    # as an unrelated error on `sf`.
    if not is_soundfile_installed():
        raise ImportError(
            "soundfile is required for this function. Install it with 'pip install mistral-common[soundfile]'"
        )

    # Read the bytes into an audio file.
    with io.BytesIO(audio_bytes) as audio_file, sf.SoundFile(audio_file) as f:
        # Read the entire audio data
        audio_array = f.read(dtype="float32")
        sampling_rate = f.samplerate
        audio_format = f.format

    # Validate the container format against the supported set and normalise its spelling.
    format_name = AudioFormat(audio_format).value.lower()

    if audio_array.ndim != 1:
        if strict:
            raise ValueError(f"{audio_array.ndim=}")
        # Down-mix by averaging over axis 1 (soundfile documents multi-channel
        # reads as a frames x channels array).
        audio_array = audio_array.mean(axis=1)

    return Audio(audio_array=audio_array, sampling_rate=sampling_rate, format=format_name)

from_file(file, strict=True) staticmethod

Create an Audio instance from an audio file.

Parameters:

Name Type Description Default
file str

Path to the audio file.

required
strict bool

Whether to strictly enforce mono audio. Defaults to True.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/audio.py
@staticmethod
def from_file(file: str, strict: bool = True) -> "Audio":
    r"""Load an audio file from disk and wrap it in an Audio instance.

    A ``file://`` prefix on a string path is dropped before the path is used.
    The file content is read as raw bytes and handed to `Audio.from_bytes`.

    Args:
        file: Path to the audio file.
        strict: Whether to strictly enforce mono audio. Defaults to True.

    Returns:
        An instance of the Audio class.

    Raises:
        ImportError: If soundfile is not installed.
        FileNotFoundError: If the path does not exist.
    """
    if not is_soundfile_installed():
        raise ImportError(
            "soundfile is required for this function. Install it with 'pip install mistral-common[soundfile]'"
        )

    uri_scheme = "file://"
    if isinstance(file, str) and file.startswith(uri_scheme):
        file = file[len(uri_scheme) :]

    location = Path(file)
    if not location.exists():
        raise FileNotFoundError(f"{file=} does not exist")

    raw_content = location.read_bytes()
    return Audio.from_bytes(raw_content, strict=strict)

from_raw_audio(audio) staticmethod

Create an Audio instance from a RawAudio object.

Parameters:

Name Type Description Default
audio RawAudio

The RawAudio object containing audio data.

required

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/audio.py
@staticmethod
def from_raw_audio(audio: "RawAudio") -> "Audio":
    r"""Create an Audio instance from a RawAudio object.

    Args:
        audio: The RawAudio object containing audio data.

    Returns:
        An instance of the Audio class.
    """
    if isinstance(audio.data, bytes):
        return Audio.from_bytes(audio.data)
    elif isinstance(audio.data, str):
        return Audio.from_base64(audio.data)
    else:
        raise ValueError(f"Unsupported audio data type: {type(audio.data)}")

from_url(url, strict=True) staticmethod

Create an Audio instance from a URL.

Parameters:

Name Type Description Default
url str

The URL of the audio file.

required
strict bool

Whether to strictly enforce mono audio.

True

Returns:

Type Description
Audio

An instance of the Audio class.

Source code in src/mistral_common/audio.py
@staticmethod
def from_url(url: str, strict: bool = True, timeout: Union[float, None] = None) -> "Audio":
    r"""Create an Audio instance from a URL.

    The URL is fetched with an HTTP GET, a non-success status is treated as a
    failure, and the response body is decoded with `Audio.from_bytes`.

    Args:
        url: The URL of the audio file.
        strict: Whether to strictly enforce mono audio.
        timeout: Seconds to wait for the server before giving up, forwarded to
            `requests.get`. Defaults to None, which keeps the previous
            behaviour of waiting indefinitely; pass a value to bound the wait.

    Returns:
        An instance of the Audio class.

    Raises:
        ValueError: If the download fails (including a timeout or an HTTP
            error status) or if the downloaded content cannot be turned into
            an Audio instance. The original exception is attached as the cause.
    """
    try:
        # Without a timeout a stalled server blocks the caller forever, so let callers opt in to one.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return Audio.from_bytes(response.content, strict=strict)
    except requests.RequestException as e:  # Something went wrong with the request.
        raise ValueError(f"Failed to download audio from URL: {url}") from e
    except Exception as e:  # Something went wrong with the audio file.
        raise ValueError(f"Failed to create Audio instance from URL: {url} .") from e

resample(new_sampling_rate)

Resample audio data to a new sampling rate.

Parameters:

Name Type Description Default
new_sampling_rate int

The new sampling rate to resample the audio to.

required
Source code in src/mistral_common/audio.py
def resample(self, new_sampling_rate: int) -> None:
    r"""Change the sampling rate of the stored audio in place.

    Both `audio_array` and `sampling_rate` are updated on the instance. When
    the audio is already at the requested rate nothing is touched.

    Args:
        new_sampling_rate: The new sampling rate to resample the audio to.

    Raises:
        ImportError: If a rate change is needed but soxr is not installed.
    """
    current_rate = self.sampling_rate
    if current_rate == new_sampling_rate:
        # Already at the target rate: skip the soxr dependency check entirely.
        return

    if not is_soxr_installed():
        raise ImportError("soxr is required for this function. Install it with 'pip install mistral-common[soxr]'")

    resampled = soxr.resample(self.audio_array, current_rate, new_sampling_rate, quality="HQ")
    self.audio_array = resampled
    self.sampling_rate = new_sampling_rate

to_base64(format, prefix=False)

Convert the audio data to a base64 encoded string.

Parameters:

Name Type Description Default
format str

The format to encode the audio in.

required
prefix bool

Whether to add a data prefix to the base64 encoded string.

False

Returns:

Type Description
str

The base64 encoded audio data.

Source code in src/mistral_common/audio.py
def to_base64(self, format: str, prefix: bool = False) -> str:
    r"""Serialise the audio to the given format and return it base64 encoded.

    The samples are written with soundfile into an in-memory buffer using the
    upper-cased format name, and the buffer content is base64 encoded.

    Args:
        format: The format to encode the audio in.
        prefix: Whether to add a data prefix to the base64 encoded string.

    Returns:
        The base64 encoded audio data, preceded by
        ``data:audio/<format>;base64,`` when `prefix` is True.

    Raises:
        ImportError: If soundfile is not installed.
    """
    if not is_soundfile_installed():
        raise ImportError(
            "soundfile is required for this function. Install it with 'pip install mistral-common[soundfile]'"
        )

    assert format in EXPECTED_FORMAT_VALUES, f"{format=} not in {EXPECTED_FORMAT_VALUES=}"

    with io.BytesIO() as buffer:
        sf.write(buffer, self.audio_array, self.sampling_rate, format=format.upper())
        # getvalue() returns the whole buffer regardless of the current position.
        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")

    if not prefix:
        return encoded
    return f"data:audio/{format.lower()};base64,{encoded}"

hertz_to_mel(freq)

Convert frequency from hertz to mels using the "slaney" mel-scale.

Parameters:

Name Type Description Default
freq Union[float, ndarray]

The frequency, or multiple frequencies, in hertz (Hz).

required

Returns:

Type Description
Union[float, ndarray]

The frequencies on the mel scale.

Source code in src/mistral_common/audio.py
def hertz_to_mel(freq: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    r"""Map frequencies in hertz onto the "slaney" mel scale.

    The mapping is linear below 1000 Hz and logarithmic from 1000 Hz upwards.
    Array inputs are not modified; a new array is returned.

    Args:
        freq: The frequency, or multiple frequencies, in hertz (Hz).

    Returns:
        The frequencies on the mel scale.
    """
    knee_hertz = 1000.0  # start of the logarithmic region, in Hz
    knee_mel = 15.0  # mel value at the knee
    log_scale = 27.0 / np.log(6.4)

    # Linear mapping, applied everywhere first and overwritten in the log region.
    result = 3.0 * freq / 200.0

    if not isinstance(freq, np.ndarray):
        # Scalar input: pick the branch directly.
        if freq >= knee_hertz:
            return knee_mel + np.log(freq / knee_hertz) * log_scale
        return result

    assert isinstance(result, np.ndarray), type(result)
    # Only take the logarithm of entries in the log region.
    in_log_region = freq >= knee_hertz
    result[in_log_region] = knee_mel + np.log(freq[in_log_region] / knee_hertz) * log_scale
    return result

mel_filter_bank(num_frequency_bins, num_mel_bins, min_frequency, max_frequency, sampling_rate) cached

Create a Mel filter bank matrix for converting frequency bins to the Mel scale.

This function generates a filter bank matrix that can be used to transform a spectrum represented in frequency bins to the Mel scale. The Mel scale is a perceptual scale of pitches judged by listeners to be equal in distance from one another.

Parameters:

Name Type Description Default
num_frequency_bins int

The number of frequency bins in the input spectrum.

required
num_mel_bins int

The number of desired Mel bins in the output.

required
min_frequency float

The minimum frequency (in Hz) to consider.

required
max_frequency float

The maximum frequency (in Hz) to consider.

required
sampling_rate int

The sampling rate of the audio signal.

required

Returns:

Type Description
ndarray

A filter bank matrix of shape (num_frequency_bins, num_mel_bins)

ndarray

that can be used to project frequency bin energies onto Mel bins.

Source code in src/mistral_common/audio.py
@cache
def mel_filter_bank(
    num_frequency_bins: int,
    num_mel_bins: int,
    min_frequency: float,
    max_frequency: float,
    sampling_rate: int,
) -> np.ndarray:
    r"""Create a Mel filter bank matrix for converting frequency bins to the Mel scale.

    This function generates a filter bank matrix that can be used to transform a
    spectrum represented in frequency bins to the Mel scale. The Mel scale is a
    perceptual scale of pitches judged by listeners to be equal in distance from one another.

    The function is wrapped in `@cache`, so repeated calls with the same arguments
    return the same array object; callers should treat the result as read-only.

    Args:
        num_frequency_bins: The number of frequency bins in the input spectrum.
        num_mel_bins: The number of desired Mel bins in the output.
        min_frequency: The minimum frequency (in Hz) to consider.
        max_frequency: The maximum frequency (in Hz) to consider.
        sampling_rate: The sampling rate of the audio signal.

    Returns:
        A filter bank matrix of shape (num_frequency_bins, num_mel_bins): each
        column is one triangular mel filter sampled at the FFT bin frequencies.
        It can be used to project frequency bin energies onto Mel bins.

    Raises:
        ValueError: If `num_frequency_bins` is smaller than 2, if `min_frequency`
            exceeds `max_frequency`, or if at least one mel filter is zero everywhere.
    """
    if num_frequency_bins < 2:
        raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2")

    if min_frequency > max_frequency:
        raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}")

    # center points of the triangular mel filters
    # (num_mel_bins + 2 points: each filter needs a left edge, a centre and a right edge)
    mel_min = hertz_to_mel(min_frequency)
    mel_max = hertz_to_mel(max_frequency)
    mel_freqs = np.linspace(mel_min, mel_max, num_mel_bins + 2)
    filter_freqs = mel_to_hertz(mel_freqs)

    # frequencies of FFT bins in Hz
    fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)

    mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)

    # Slaney-style mel is scaled to be approx constant energy per channel
    # enorm has one entry per mel filter; expanding on axis 0 broadcasts it across the frequency axis.
    enorm = 2.0 / (filter_freqs[2 : num_mel_bins + 2] - filter_freqs[:num_mel_bins])
    mel_filters *= np.expand_dims(enorm, 0)

    # Reduce over the frequency axis: a column whose maximum is 0 is a filter that covers no FFT bin.
    if (mel_filters.max(axis=0) == 0.0).any():
        raise ValueError(
            "At least one mel filter has all zero values. "
            f"The value for `num_mel_bins` ({num_mel_bins}) "
            "may be set too high. "
            "Or, the value for `num_frequency_bins` "
            f"({num_frequency_bins}) may be set too low."
        )
    return mel_filters

mel_to_hertz(mels)

Convert frequency from mels to hertz using the "slaney" mel-scale.

Parameters:

Name Type Description Default
mels ndarray

The frequency, or multiple frequencies, in mels.

required

Returns:

Type Description
ndarray

The frequencies in hertz.

Source code in src/mistral_common/audio.py
def mel_to_hertz(mels: Union[float, np.ndarray]) -> Union[float, np.ndarray]:
    r"""Convert frequency from mels to hertz using the "slaney" mel-scale.

    This is the inverse of `hertz_to_mel`: linear below 15 mels (1000 Hz) and
    exponential from 15 mels upwards. Both a single value and a numpy array
    are accepted; array inputs are not modified.

    Args:
        mels: The frequency, or multiple frequencies, in mels.

    Returns:
        The frequencies in hertz, as an array when `mels` is an array and as a
        scalar otherwise.
    """
    min_log_hertz = 1000.0
    min_log_mel = 15.0
    logstep = np.log(6.4) / 27.0
    # Linear mapping, applied everywhere first and overwritten in the log region.
    freq = 200.0 * mels / 3.0

    if isinstance(mels, np.ndarray):
        log_region = mels >= min_log_mel
        freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
    elif mels >= min_log_mel:
        # Scalars do not support masked assignment, so compute the log-region value directly.
        freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))

    return freq