Skip to content

mistral_common.tokens.tokenizers.mistral

MistralTokenizer(instruct_tokenizer, validator, request_normalizer)

Bases: Generic[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, TokenizedType]

Mistral tokenizer.

This class is a wrapper around an InstructTokenizer, a MistralRequestValidator and an InstructRequestNormalizer.

It provides a convenient interface to tokenize, validate and normalize Mistral requests.

Attributes:

Name Type Description
instruct_tokenizer InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType]

The instruct tokenizer to use. See InstructTokenizer.

Parameters:

Name Type Description Default
instruct_tokenizer InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType]

The instruct tokenizer to use.

required
validator MistralRequestValidator[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType]

The request validator to use.

required
request_normalizer InstructRequestNormalizer[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, InstructRequestType]

The request normalizer to use.

required
Source code in src/mistral_common/tokens/tokenizers/mistral.py
def __init__(
    self,
    instruct_tokenizer: InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType],
    validator: MistralRequestValidator[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType],
    request_normalizer: InstructRequestNormalizer[
        UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, InstructRequestType
    ],
):
    r"""Builds a `MistralTokenizer` from its three collaborating components.

    Args:
        instruct_tokenizer: The instruct tokenizer, stored on the public `instruct_tokenizer` attribute.
        validator: The request validator, stored privately.
        request_normalizer: The request normalizer, stored privately.
    """
    # Public handle on the wrapped tokenizer.
    self.instruct_tokenizer: InstructTokenizer[
        InstructRequest, FIMRequest, TokenizedType, AssistantMessageType
    ] = instruct_tokenizer
    # Private collaborators.
    self._instruct_request_normalizer = request_normalizer
    self._chat_completion_request_validator = validator

__reduce__()

Provides a recipe for pickling (serializing) this object, which is necessary for use with multiprocessing.

Returns:

Type Description
Tuple[Callable, Tuple[Any, ...]]

A tuple of the factory function and the arguments to reconstruct the object from its source file.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def __reduce__(self) -> Tuple[Callable, Tuple[Any, ...]]:
    """
    Describes how to rebuild this object when it is pickled, which is necessary for use with multiprocessing.

    The recipe does not carry the object's state; it names the `from_file` factory together with
    the tokenizer file path and the validator's mode.

    Returns:
        A `(factory, args)` tuple: the factory function and the `(file_path, mode)` arguments to
        reconstruct the object from its source file.
    """
    factory = MistralTokenizer.from_file
    source_path = self.instruct_tokenizer.tokenizer.file_path
    validation_mode = self._chat_completion_request_validator._mode
    return factory, (source_path, validation_mode)

decode(tokens, special_token_policy=None)

Decodes a list of tokens into a string.

Parameters:

Name Type Description Default
tokens List[int]

The tokens to decode.

required
special_token_policy Optional[SpecialTokenPolicy]

The policy to use for special tokens. Passing None is deprecated and will be changed to SpecialTokenPolicy.IGNORE in mistral_common=1.10.0.

None

Returns:

Type Description
str

The decoded string.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Turns a list of tokens back into a string via the wrapped instruct tokenizer.

    Args:
        tokens: The tokens to decode.
        special_token_policy: The policy to use for special tokens; forwarded unchanged. Passing `None` is
            deprecated and will be changed to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

    Returns:
        The decoded string.
    """
    inner_tokenizer = self.instruct_tokenizer
    decoded_text = inner_tokenizer.decode(tokens, special_token_policy=special_token_policy)
    return decoded_text

encode_chat_completion(request, max_model_input_len=None)

Encodes a chat completion request.

Parameters:

Name Type Description Default
request ChatCompletionRequest[UATS]

The chat completion request to encode.

required
max_model_input_len Optional[int]

The maximum length of the input to the model. If None, the input will not be truncated.

None

Returns:

Type Description
TokenizedType

The encoded chat completion request.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def encode_chat_completion(
    self, request: ChatCompletionRequest[UATS], max_model_input_len: Optional[int] = None
) -> TokenizedType:
    r"""Validates, normalizes and tokenizes a chat completion request.

    Args:
        request: The chat completion request to encode.
        max_model_input_len: The maximum length of the input to the model.
            If `None`, the input will not be truncated.

    Returns:
        The encoded chat completion request.

    Raises:
        TokenizerException: If the request asks for truncation but `max_model_input_len` is `None`.
    """
    # Validation always runs first, so an invalid request fails before the truncation check.
    checked_request = self._chat_completion_request_validator.validate_request(request)

    if max_model_input_len is None:
        # the max_model_input_len arg should not be optional ;
        # but this function is used in many small scripts that have no use
        # for truncation, and don't provide the max model len
        if request.truncate_for_context_length:
            raise TokenizerException(
                "encoding a chat completion request with truncation, but no max model len was provided",
            )

    normalized_request = self._instruct_request_normalizer.from_chat_completion_request(checked_request)

    # Only requests that opt in to truncation carry the length limit forward.
    if request.truncate_for_context_length:
        normalized_request.truncate_at_max_tokens = max_model_input_len

    encoded = self.instruct_tokenizer.encode_instruct(normalized_request)
    return encoded

encode_fim(request)

Encodes a fill in the middle request.

Parameters:

Name Type Description Default
request FIMRequest

The fill in the middle request to encode.

required

Returns:

Type Description
TokenizedType

The encoded fill in the middle request.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def encode_fim(self, request: FIMRequest) -> TokenizedType:
    r"""Tokenizes a fill in the middle request via the wrapped instruct tokenizer.

    Args:
        request: The fill in the middle request to encode.

    Returns:
        The encoded fill in the middle request.
    """
    inner_tokenizer = self.instruct_tokenizer
    encoded = inner_tokenizer.encode_fim(request)
    return encoded

encode_transcription(request)

Encodes a transcription request.

Parameters:

Name Type Description Default
request TranscriptionRequest

The transcription request to encode.

required

Returns:

Type Description
TokenizedType

The encoded transcription request.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def encode_transcription(self, request: TranscriptionRequest) -> TokenizedType:
    r"""Tokenizes a transcription request via the wrapped instruct tokenizer.

    Args:
        request: The transcription request to encode.

    Returns:
        The encoded transcription request.
    """
    inner_tokenizer = self.instruct_tokenizer
    encoded = inner_tokenizer.encode_transcription(request)
    return encoded

from_file(tokenizer_filename, mode=ValidationMode.test) classmethod

Loads a tokenizer from a file.

Parameters:

Name Type Description Default
tokenizer_filename Union[str, Path]

The path to the tokenizer file.

required
mode ValidationMode

The validation mode to use.

test

Returns:

Type Description
MistralTokenizer

The loaded tokenizer.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def from_file(
    cls,
    tokenizer_filename: Union[str, Path],
    mode: ValidationMode = ValidationMode.test,
) -> "MistralTokenizer":
    r"""Builds a `MistralTokenizer` from a tokenizer file.

    The file is opened as either a tekken or a sentencepiece tokenizer, optional image and audio
    encoders are created from its configuration, and the instruct tokenizer and request
    validator matching the tokenizer version are assembled.

    Args:
        tokenizer_filename: The path to the tokenizer file.
        mode: The validation mode passed to the request validator.

    Returns:
        The loaded tokenizer.

    Raises:
        TokenizerException: If the file is neither a tekken nor a sentencepiece tokenizer, or if
            no branch below handles the tokenizer version.
        AssertionError: If (with assertions enabled) an image or audio encoder was configured for
            a tokenizer version whose branch rejects it.
    """
    base_tokenizer: Union[SentencePieceTokenizer, Tekkenizer]

    if is_tekken(tokenizer_filename):
        base_tokenizer = Tekkenizer.from_file(tokenizer_filename)
        image_cfg, audio_cfg = base_tokenizer.image, base_tokenizer.audio
    elif is_sentencepiece(tokenizer_filename):
        base_tokenizer = SentencePieceTokenizer(tokenizer_filename)
        # spm can't have audio
        image_cfg, audio_cfg = get_image_config(tokenizer_filename), None
    else:
        raise TokenizerException(f"Unrecognized tokenizer file: {tokenizer_filename}")

    # Encoders are optional: each one is only built when its config is present.
    image_encoder = None
    if image_cfg is not None:
        image_encoder = load_image_encoder(image_cfg, base_tokenizer)

    audio_encoder = None
    if audio_cfg is not None:
        assert isinstance(base_tokenizer, Tekkenizer), "Audio is only supported for tekken tokenizers"
        audio_encoder = load_audio_encoder(audio_cfg, base_tokenizer)

    version = base_tokenizer.version
    request_normalizer = normalizer_for_tokenizer_version(version)

    # v1 and v2 differ only in the instruct tokenizer class; neither accepts encoders.
    if version in (TokenizerVersion.v1, TokenizerVersion.v2):
        assert image_encoder is None, "Tokenizer version needs to be >= v3"
        assert audio_encoder is None, "Tokenizer version needs to be >= v7"
        legacy_cls = InstructTokenizerV1 if version == TokenizerVersion.v1 else InstructTokenizerV2
        return MistralTokenizer(
            legacy_cls(base_tokenizer),
            validator=MistralRequestValidator(mode=mode),
            request_normalizer=request_normalizer,
        )

    if version == TokenizerVersion.v3:
        assert audio_encoder is None, "Tokenizer version needs to be >= v7"
        return MistralTokenizer(
            InstructTokenizerV3(base_tokenizer, image_encoder=image_encoder),
            validator=MistralRequestValidatorV3(mode=mode),
            request_normalizer=request_normalizer,
        )

    # v7 and v11 share the validator and accept both encoders.
    if version in (TokenizerVersion.v7, TokenizerVersion.v11):
        multimodal_cls = InstructTokenizerV7 if version == TokenizerVersion.v7 else InstructTokenizerV11
        return MistralTokenizer(
            multimodal_cls(base_tokenizer, image_encoder=image_encoder, audio_encoder=audio_encoder),
            validator=MistralRequestValidatorV5(mode=mode),
            request_normalizer=request_normalizer,
        )

    if version == TokenizerVersion.v13:
        # Only the image encoder is forwarded for v13.
        return MistralTokenizer(
            InstructTokenizerV13(base_tokenizer, image_encoder=image_encoder),
            validator=MistralRequestValidatorV13(mode=mode),
            request_normalizer=request_normalizer,
        )

    # Reached only when the tokenizer version matches none of the branches above.
    raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

from_hf_hub(repo_id, token=None, revision=None, force_download=False, local_files_only=False, mode=ValidationMode.test) staticmethod

Download the Mistral tokenizer for a given Hugging Face repository ID.

See here for a list of our OSS models.

Parameters:

Name Type Description Default
repo_id str

The Hugging Face repo ID.

required
token Optional[Union[bool, str]]

The Hugging Face token to use to download the tokenizer.

None
revision Optional[str]

The revision of the model to use. If None, the latest revision will be used.

None
mode ValidationMode

The validation mode to use.

test
force_download bool

Whether to force the download of the tokenizer. If True, the tokenizer will be downloaded even if it is already cached.

False
local_files_only bool

Whether to only use local files. If True, the tokenizer is used only if it is already cached.

False

Returns:

Type Description
MistralTokenizer

The Mistral tokenizer for the given model.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@staticmethod
def from_hf_hub(
    repo_id: str,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    mode: ValidationMode = ValidationMode.test,
) -> "MistralTokenizer":
    r"""Fetch the tokenizer file of a Hugging Face repository and load it as a Mistral tokenizer.

    See [here](../../../../models.md#list-of-open-models) for a list of our OSS models.

    Args:
        repo_id: The Hugging Face repo ID.
        token: The Hugging Face token to use to download the tokenizer.
        revision: The revision of the model to use. If `None`, the latest revision will be used.
        mode: The validation mode to use.
        force_download: Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded
            even if it is already cached.
        local_files_only: Whether to only use local files. If `True`, the tokenizer is used only if it is
            already cached.

    Returns:
        The Mistral tokenizer for the given model.
    """
    # Every hub-related option is forwarded as-is; `mode` only matters when loading the file.
    hub_options = {
        "repo_id": repo_id,
        "token": token,
        "revision": revision,
        "force_download": force_download,
        "local_files_only": local_files_only,
    }
    local_tokenizer_file = download_tokenizer_from_hf_hub(**hub_options)
    return MistralTokenizer.from_file(local_tokenizer_file, mode=mode)

from_model(model, strict=False) classmethod

Get the Mistral tokenizer for a given model.

Parameters:

Name Type Description Default
model str

The model name.

required
strict bool

Whether to use strict model name matching. If False, the model name is matched as a substring. This is deprecated and will be removed in mistral_common=1.10.0.

False

Returns:

Type Description
MistralTokenizer

The Mistral tokenizer for the given model.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def from_model(cls, model: str, strict: bool = False) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer for a given model.

    Args:
        model: The model name.
        strict: Whether to use strict model name matching. If `True`, `model` must be an exact key of
            `MODEL_NAME_TO_TOKENIZER_CLS`. If `False`, the first known model name contained in the
            lower-cased `model` is used, with an exact match as fallback. Non-strict matching is deprecated
            and will be removed in `mistral_common=1.10.0`.

    Returns:
        The Mistral tokenizer for the given model.

    Raises:
        TokenizerException: If no tokenizer matches `model`.
    """
    if not strict:
        warnings.warn(
            "Calling `MistralTokenizer.from_model(..., strict=False)` is deprecated as it can lead to incorrect "
            "tokenizers. It is strongly recommended to use `MistralTokenizer.from_model(..., strict=True)` "
            "which will become the default in `mistral_common=1.10.0`. "
            "If you are using `mistral_common` for open-sourced model weights, we recommend using "
            "`MistralTokenizer.from_file('<path/to/tokenizer/file>')` instead.",
            FutureWarning,
            # Attribute the warning to the caller of `from_model`, not to this library line.
            stacklevel=2,
        )

        # TODO: Delete this code in mistral_common >= 1.10.0.
        # Substring search: a known model name only has to appear somewhere in the
        # lower-cased requested name, and the first match in mapping order wins.
        lowered_model = model.lower()
        for model_name, tokenizer_cls in MODEL_NAME_TO_TOKENIZER_CLS.items():
            if model_name in lowered_model:
                return tokenizer_cls()

    # Exact, case-sensitive lookup (the only path when `strict=True`).
    if model not in MODEL_NAME_TO_TOKENIZER_CLS:
        raise TokenizerException(f"Unrecognized model: {model}")

    return MODEL_NAME_TO_TOKENIZER_CLS[model]()

v1() classmethod

Get the Mistral tokenizer v1.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v1(cls) -> "MistralTokenizer":
    r"""Load the v1 Mistral tokenizer from the class data path, in test validation mode."""
    tokenizer_file = cls._data_path() / "tokenizer.model.v1"
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

v2() classmethod

Get the Mistral tokenizer v2.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v2(cls) -> "MistralTokenizer":
    r"""Load the v2 Mistral tokenizer from the class data path, in test validation mode."""
    tokenizer_file = cls._data_path() / "mistral_instruct_tokenizer_240216.model.v2"
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

v3(is_tekken=False, is_mm=False) classmethod

Get the Mistral tokenizer v3.

Parameters:

Name Type Description Default
is_tekken bool

Whether the tokenizer is a tekken tokenizer. See Tekkenizer.

False
is_mm bool

Whether to load image tokenizer.

False

Returns:

Type Description
MistralTokenizer

The Mistral tokenizer v3.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v3(cls, is_tekken: bool = False, is_mm: bool = False) -> "MistralTokenizer":
    r"""Load a v3 Mistral tokenizer, picking the tokenizer file from the two flags.

    Args:
        is_tekken: Whether the tokenizer is a tekken tokenizer. See
            [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
        is_mm: Whether to load image tokenizer.

    Returns:
        The Mistral tokenizer v3.

    Raises:
        ValueError: If `is_mm` is set without `is_tekken`.
    """
    if not is_tekken:
        # The non-tekken file has no multimodal variant.
        if is_mm:
            raise ValueError("Multimodal tokenizer is currently only supported for tekken")
        tokenizer_name = "mistral_instruct_tokenizer_240323.model.v3"
    elif is_mm:
        tokenizer_name = "tekken_240911.json"
    else:
        tokenizer_name = "tekken_240718.json"

    tokenizer_file = cls._data_path() / tokenizer_name
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

v7(is_mm=False) classmethod

Get the Mistral tokenizer v7.

Parameters:

Name Type Description Default
is_mm bool

Whether to load the image tokenizer.

False

Returns:

Type Description
MistralTokenizer

The Mistral tokenizer v7.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v7(cls, is_mm: bool = False) -> "MistralTokenizer":
    """Load a v7 Mistral tokenizer from the class data path, in test validation mode.

    Args:
        is_mm: Whether to load the image tokenizer.

    Returns:
        The Mistral tokenizer v7.
    """
    # Both variants load the same way; only the file name differs.
    tokenizer_name = (
        "mistral_instruct_tokenizer_241114.model.v7m1" if is_mm else "mistral_instruct_tokenizer_241114.model.v7"
    )
    return cls.from_file(str(cls._data_path() / tokenizer_name), mode=ValidationMode.test)

load_audio_encoder(audio_config, tokenizer)

Load an audio encoder from a config and a tokenizer.

Parameters:

Name Type Description Default
audio_config AudioConfig

The audio config.

required
tokenizer Tekkenizer

The tokenizer.

required

Returns:

Type Description
AudioEncoder

The audio encoder.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def load_audio_encoder(audio_config: AudioConfig, tokenizer: Tekkenizer) -> AudioEncoder:
    r"""Load an audio encoder from a config and a tokenizer.

    The tokenizer is asked for the control token ids of the `audio` and `begin_audio` special
    tokens, which are handed to the encoder together with the config.

    Args:
        audio_config: The audio config.
        tokenizer: The tokenizer.

    Returns:
        The audio encoder.
    """
    audio_token_id = tokenizer.get_control_token(SpecialTokens.audio.value)
    begin_audio_token_id = tokenizer.get_control_token(SpecialTokens.begin_audio.value)
    audio_special_ids = SpecialAudioIDs(audio=audio_token_id, begin_audio=begin_audio_token_id)
    return AudioEncoder(audio_config, audio_special_ids)

load_image_encoder(image_config, tokenizer)

Load an image encoder from a config and a tokenizer.

Parameters:

Name Type Description Default
image_config ImageConfig

The image config.

required
tokenizer Union[Tekkenizer, SentencePieceTokenizer]

The tokenizer.

required

Returns:

Type Description
ImageEncoder

The image encoder.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def load_image_encoder(image_config: ImageConfig, tokenizer: Union[Tekkenizer, SentencePieceTokenizer]) -> ImageEncoder:
    r"""Load an image encoder from a config and a tokenizer.

    The tokenizer is asked for the control token ids of the `img`, `img_break` and `img_end`
    special tokens, which are handed to the encoder together with the config.

    Args:
        image_config: The image config.
        tokenizer: The tokenizer.

    Returns:
        The image encoder.
    """
    img_token_id = tokenizer.get_control_token(SpecialTokens.img.value)
    img_break_token_id = tokenizer.get_control_token(SpecialTokens.img_break.value)
    img_end_token_id = tokenizer.get_control_token(SpecialTokens.img_end.value)
    image_special_ids = SpecialImageIDs(img=img_token_id, img_break=img_break_token_id, img_end=img_end_token_id)
    return ImageEncoder(image_config, image_special_ids)