
mistral_common.tokens.tokenizers.mistral

MistralTokenizer(instruct_tokenizer, validator, request_normalizer)

Bases: Generic[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, TokenizedType]

Mistral tokenizer.

This class is a wrapper around an `InstructTokenizer`, a `MistralRequestValidator`, and an `InstructRequestNormalizer`.

It provides a convenient interface to tokenize, validate, and normalize Mistral requests.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `instruct_tokenizer` | `InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType]` | The instruct tokenizer to use. | *required* |
| `validator` | `MistralRequestValidator[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType]` | The request validator to use. | *required* |
| `request_normalizer` | `InstructRequestNormalizer[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, InstructRequestType]` | The request normalizer to use. | *required* |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def __init__(
    self,
    instruct_tokenizer: InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType],
    validator: MistralRequestValidator[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType],
    request_normalizer: InstructRequestNormalizer[
        UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, InstructRequestType
    ],
):
    r"""Initializes a `MistralTokenizer`.

    Args:
        instruct_tokenizer: The instruct tokenizer to use.
        validator: The request validator to use.
        request_normalizer: The request normalizer to use.
    """
    self._chat_completion_request_validator = validator
    self._instruct_request_normalizer = request_normalizer
    self.instruct_tokenizer = instruct_tokenizer
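
A `MistralTokenizer` is usually obtained through the class methods below (`from_file`, `from_hf_hub`, `from_model`, or the `v1`/`v2`/`v3`/`v7` helpers) rather than by calling the constructor directly. A minimal sketch of the typical round trip, assuming `mistral_common` is installed:

```python
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

# Load the bundled v3 tokenizer (see the `v3` classmethod below).
tokenizer = MistralTokenizer.v3()

tokenized = tokenizer.encode_chat_completion(
    ChatCompletionRequest(messages=[UserMessage(content="Hello, world!")])
)
print(tokenized.tokens)  # the encoded token ids
```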

decode(tokens)

Decodes a list of tokens into a string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `tokens` | `List[int]` | The tokens to decode. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `str` | The decoded string. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def decode(self, tokens: List[int]) -> str:
    r"""Decodes a list of tokens into a string.

    Args:
        tokens: The tokens to decode.

    Returns:
        The decoded string.
    """
    return self.instruct_tokenizer.decode(tokens)
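
A short sketch of a round trip, reusing `tokenizer`, `ChatCompletionRequest`, and `UserMessage` from the example above:

```python
tokens = tokenizer.encode_chat_completion(
    ChatCompletionRequest(messages=[UserMessage(content="Hello!")])
).tokens

# Recover a string from the token ids.
print(tokenizer.decode(tokens))
```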

encode_chat_completion(request, max_model_input_len=None)

Encodes a chat completion request.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `request` | `ChatCompletionRequest[UATS]` | The chat completion request to encode. | *required* |
| `max_model_input_len` | `Optional[int]` | The maximum length of the input to the model. If `None`, the input will not be truncated. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `TokenizedType` | The encoded chat completion request. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def encode_chat_completion(
    self, request: ChatCompletionRequest[UATS], max_model_input_len: Optional[int] = None
) -> TokenizedType:
    r"""Encodes a chat completion request.

    Args:
        request: The chat completion request to encode.
        max_model_input_len: The maximum length of the input to the model.
            If `None`, the input will not be truncated.

    Returns:
        The encoded chat completion request.
    """

    validated_request = self._chat_completion_request_validator.validate_request(request)

    if max_model_input_len is None and request.truncate_for_context_length:
        # The max_model_input_len arg should not be optional, but this function
        # is used in many small scripts that have no use for truncation and
        # don't provide the max model length.
        raise TokenizerException(
            "encoding a chat completion request with truncation, but no max model len was provided",
        )

    instruct_request = self._instruct_request_normalizer.from_chat_completion_request(validated_request)

    if request.truncate_for_context_length:
        instruct_request.truncate_at_max_tokens = max_model_input_len

    return self.instruct_tokenizer.encode_instruct(instruct_request)
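
When `truncate_for_context_length` is set on the request, a `max_model_input_len` must also be passed, otherwise a `TokenizerException` is raised (see the source above). A sketch, where the context length of 32768 is an arbitrary illustration:

```python
request = ChatCompletionRequest(
    messages=[UserMessage(content="Summarize the following document ...")],
    truncate_for_context_length=True,
)
# Truncate the encoded input to at most 32768 tokens.
tokenized = tokenizer.encode_chat_completion(request, max_model_input_len=32768)
```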

encode_fim(request)

Encodes a fill-in-the-middle (FIM) request.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `request` | `FIMRequest` | The fill-in-the-middle request to encode. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `TokenizedType` | The encoded fill-in-the-middle request. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def encode_fim(self, request: FIMRequest) -> TokenizedType:
    r"""Encodes a fill in the middle request.

    Args:
        request: The fill in the middle request to encode.

    Returns:
        The encoded fill in the middle request.
    """
    return self.instruct_tokenizer.encode_fim(request)
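
A sketch of a FIM encoding, assuming `FIMRequest` is imported from `mistral_common.protocol.fim.request` (its location in recent releases):

```python
from mistral_common.protocol.fim.request import FIMRequest

# The model is asked to fill the gap between `prompt` and `suffix`.
fim = FIMRequest(prompt="def add(a: int, b: int) -> int:\n", suffix="    return result\n")
tokenized = tokenizer.encode_fim(fim)
print(tokenized.tokens)
```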

from_file(tokenizer_filename, mode=ValidationMode.test) classmethod

Loads a tokenizer from a file.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `tokenizer_filename` | `str` | The path to the tokenizer file. | *required* |
| `mode` | `ValidationMode` | The validation mode to use. | `test` |

Returns:

| Type | Description |
| --- | --- |
| `MistralTokenizer` | The loaded tokenizer. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def from_file(
    cls,
    tokenizer_filename: str,
    mode: ValidationMode = ValidationMode.test,
) -> "MistralTokenizer":
    r"""Loads a tokenizer from a file.

    Args:
        tokenizer_filename: The path to the tokenizer file.
        mode: The validation mode to use.

    Returns:
        The loaded tokenizer.
    """
    tokenizer: Union[SentencePieceTokenizer, Tekkenizer]

    if is_tekken(tokenizer_filename):
        tokenizer = Tekkenizer.from_file(tokenizer_filename)
        mm_config = tokenizer.multimodal
    elif is_sentencepiece(tokenizer_filename):
        tokenizer = SentencePieceTokenizer(tokenizer_filename)
        mm_config = get_mm_config(tokenizer_filename)
    else:
        raise TokenizerException(f"Unrecognized tokenizer file: {tokenizer_filename}")

    mm_encoder = load_mm_encoder(mm_config, tokenizer) if mm_config is not None else None

    request_normalizer = normalizer_for_tokenizer_version(tokenizer.version)

    if tokenizer.version == TokenizerVersion.v1:
        assert mm_encoder is None, "Tokenizer version needs to be >= v3"
        return MistralTokenizer(
            InstructTokenizerV1(tokenizer),
            validator=MistralRequestValidator(mode=mode),
            request_normalizer=request_normalizer,
        )
    elif tokenizer.version == TokenizerVersion.v2:
        assert mm_encoder is None, "Tokenizer version needs to be >= v3"
        return MistralTokenizer(
            InstructTokenizerV2(tokenizer),
            validator=MistralRequestValidator(mode=mode),
            request_normalizer=request_normalizer,
        )
    elif tokenizer.version == TokenizerVersion.v3:
        return MistralTokenizer(
            InstructTokenizerV3(tokenizer, mm_encoder=mm_encoder),
            validator=MistralRequestValidatorV3(mode=mode),
            request_normalizer=request_normalizer,
        )
    elif tokenizer.version == TokenizerVersion.v7:
        return MistralTokenizer(
            InstructTokenizerV7(tokenizer, mm_encoder=mm_encoder),
            validator=MistralRequestValidatorV5(mode=mode),
            request_normalizer=request_normalizer,
        )

    raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

from_hf_hub(model_id, **kwargs) staticmethod

Get the Mistral tokenizer for a given Hugging Face model ID.

See [here](../../../../models.md#list-of-open-models) for a list of our OSS models.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_id` | `str` | The Hugging Face model ID. | *required* |
| `kwargs` | `Any` | Additional keyword arguments to pass to `huggingface_hub.hf_hub_download`. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `MistralTokenizer` | The Mistral tokenizer for the given model. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@staticmethod
def from_hf_hub(model_id: str, **kwargs: Any) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer for a given Hugging Face model ID.

    See [here](../../../../models.md#list-of-open-models) for a list of our OSS models.

    Args:
        model_id: The Hugging Face model ID.
        kwargs: Additional keyword arguments to pass to `huggingface_hub.hf_hub_download`.

    Returns:
        The Mistral tokenizer for the given model.
    """
    tokenizer_path = download_tokenizer_from_hf_hub(model_id, **kwargs)
    return MistralTokenizer.from_file(tokenizer_path)
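
A sketch, where the model ID is illustrative; the download requires the `huggingface_hub` package and, for gated repositories, an authentication token:

```python
# Illustrative model ID; any Mistral model repo that ships a tokenizer file works.
tokenizer = MistralTokenizer.from_hf_hub("mistralai/Mistral-7B-Instruct-v0.3")
```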

from_model(model, strict=False) classmethod

Get the Mistral tokenizer for a given model.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model` | `str` | The model name. | *required* |
| `strict` | `bool` | Whether to use strict model name matching. If `False`, the model name is matched as a substring. This is deprecated and will be removed in `mistral_common=1.6.0`. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `MistralTokenizer` | The Mistral tokenizer for the given model. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def from_model(cls, model: str, strict: bool = False) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer for a given model.

    Args:
        model: The model name.
        strict: Whether to use strict model name matching. If `False`, the model name is matched as a substring.
            This is deprecated and will be removed in `mistral_common=1.6.0`.

    Returns:
        The Mistral tokenizer for the given model.
    """
    if not strict:
        warnings.warn(
            "Calling `MistralTokenizer.from_model(..., strict=False)` is deprecated as it can lead to incorrect "
            "tokenizers. It is strongly recommended to use MistralTokenizer.from_model(..., strict=True)` "
            "which will become the default in `mistral_common=1.6.0`."
            "If you are using `mistral_common` for open-sourced model weights, we recommend using "
            "`MistralTokenizer.from_file('<path/to/tokenizer/file>')` instead.",
            FutureWarning,
        )

        # TODO: Delete this code in mistral_common >= 1.6.0.
        # Substring-match against the model name mapping.
        for model_name, tokenizer_cls in MODEL_NAME_TO_TOKENIZER_CLS.items():
            if model_name in model.lower():
                return tokenizer_cls()

    if model not in MODEL_NAME_TO_TOKENIZER_CLS:
        raise TokenizerException(f"Unrecognized model: {model}")

    return MODEL_NAME_TO_TOKENIZER_CLS[model]()
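
A sketch; with `strict=True` the model name must exactly match a key of `MODEL_NAME_TO_TOKENIZER_CLS` (the name below is illustrative):

```python
# Illustrative model name; it must be a known key when strict=True.
tokenizer = MistralTokenizer.from_model("open-mistral-7b", strict=True)
```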

v1() classmethod

Get the Mistral tokenizer v1.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v1(cls) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer v1."""
    return cls.from_file(str(cls._data_path() / "tokenizer.model.v1"), mode=ValidationMode.test)

v2() classmethod

Get the Mistral tokenizer v2.

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v2(cls) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer v2."""
    return cls.from_file(
        str(cls._data_path() / "mistral_instruct_tokenizer_240216.model.v2"), mode=ValidationMode.test
    )

v3(is_tekken=False, is_mm=False) classmethod

Get the Mistral tokenizer v3.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `is_tekken` | `bool` | Whether the tokenizer is a Tekken tokenizer. See `Tekkenizer`. | `False` |
| `is_mm` | `bool` | Whether to load the multimodal tokenizer. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `MistralTokenizer` | The Mistral tokenizer v3. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v3(cls, is_tekken: bool = False, is_mm: bool = False) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer v3.

    Args:
        is_tekken: Whether the tokenizer is a tekken tokenizer. See
            [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
        is_mm: Whether to load multimodal tokenizer.

    Returns:
        The Mistral tokenizer v3.
    """
    if is_tekken and is_mm:
        tokenizer_name = "tekken_240911.json"
    elif is_tekken and not is_mm:
        tokenizer_name = "tekken_240718.json"
    elif not is_tekken and is_mm:
        raise ValueError("Multimodal tokenizer is currently only supported for tekken")
    else:
        tokenizer_name = "mistral_instruct_tokenizer_240323.model.v3"

    return cls.from_file(str(cls._data_path() / tokenizer_name), mode=ValidationMode.test)
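
A sketch of the three supported combinations; as the source above shows, `is_mm=True` without `is_tekken=True` raises a `ValueError`:

```python
spm_v3 = MistralTokenizer.v3()                           # SentencePiece tokenizer
tekken_v3 = MistralTokenizer.v3(is_tekken=True)          # Tekken tokenizer
mm_v3 = MistralTokenizer.v3(is_tekken=True, is_mm=True)  # multimodal Tekken tokenizer
```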

v7(is_mm=False) classmethod

Get the Mistral tokenizer v7.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `is_mm` | `bool` | Whether to load the multimodal tokenizer. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `MistralTokenizer` | The Mistral tokenizer v7. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
@classmethod
def v7(cls, is_mm: bool = False) -> "MistralTokenizer":
    """Get the Mistral tokenizer v7.

    Args:
        is_mm: Whether to load the multimodal tokenizer.

    Returns:
        The Mistral tokenizer v7.
    """
    if is_mm:
        return cls.from_file(
            str(cls._data_path() / "mistral_instruct_tokenizer_241114.model.v7m1"), mode=ValidationMode.test
        )
    else:
        return cls.from_file(
            str(cls._data_path() / "mistral_instruct_tokenizer_241114.model.v7"), mode=ValidationMode.test
        )
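
A short sketch of both variants:

```python
v7 = MistralTokenizer.v7()               # text-only v7 tokenizer
v7_mm = MistralTokenizer.v7(is_mm=True)  # multimodal v7m1 variant
```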

load_mm_encoder(mm_config, tokenizer)

Load a multi-modal encoder from a config and a tokenizer.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `mm_config` | `MultimodalConfig` | The multi-modal config. | *required* |
| `tokenizer` | `Union[Tekkenizer, SentencePieceTokenizer]` | The tokenizer. | *required* |

Returns:

| Type | Description |
| --- | --- |
| `MultiModalEncoder` | The multi-modal encoder. |

Source code in src/mistral_common/tokens/tokenizers/mistral.py
def load_mm_encoder(
    mm_config: MultimodalConfig, tokenizer: Union[Tekkenizer, SentencePieceTokenizer]
) -> MultiModalEncoder:
    r"""Load a multi-modal encoder from a config and a tokenizer.

    Args:
        mm_config: The multi-modal config.
        tokenizer: The tokenizer.

    Returns:
        The multi-modal encoder.
    """
    special_ids = SpecialImageIDs(
        img=tokenizer.get_control_token(SpecialTokens.img.value),
        img_break=tokenizer.get_control_token(SpecialTokens.img_break.value),
        img_end=tokenizer.get_control_token(SpecialTokens.img_end.value),
    )
    return ImageEncoder(mm_config, special_ids)
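
This helper is what `from_file` uses internally. A sketch mirroring that call path, assuming a Tekken tokenizer file (placeholder path) whose `multimodal` config is set:

```python
from mistral_common.tokens.tokenizers.tekken import Tekkenizer

tekkenizer = Tekkenizer.from_file("/path/to/tekken.json")  # placeholder path
if tekkenizer.multimodal is not None:
    mm_encoder = load_mm_encoder(tekkenizer.multimodal, tekkenizer)
```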