`mistral_common.tokens.tokenizers.mistral`

`MistralTokenizer(instruct_tokenizer, validator, request_normalizer)`

Bases: Generic[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, TokenizedType]

Mistral tokenizer.

This class is a wrapper around an InstructTokenizer, a MistralRequestValidator and an InstructRequestNormalizer.

It provides a convenient interface to tokenize, validate and normalize Mistral requests.

Attributes:

Name	Type	Description
`instruct_tokenizer`	`InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType]`	The instruct tokenizer to use. See InstructTokenizer.

Parameters:

Name	Type	Description	Default
`instruct_tokenizer`	`InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType]`	The instruct tokenizer to use.	required
`validator`	`MistralRequestValidator[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType]`	The request validator to use.	required
`request_normalizer`	`InstructRequestNormalizer[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, InstructRequestType]`	The request normalizer to use.	required

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def __init__(
    self,
    instruct_tokenizer: InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType],
    validator: MistralRequestValidator[UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType],
    request_normalizer: InstructRequestNormalizer[
        UserMessageType, AssistantMessageType, ToolMessageType, SystemMessageType, InstructRequestType
    ],
):
    r"""Creates a `MistralTokenizer` from its three collaborators.

    Args:
        instruct_tokenizer: The instruct tokenizer, stored as the public `instruct_tokenizer` attribute.
        validator: The request validator, stored privately.
        request_normalizer: The request normalizer, stored privately.
    """
    # Public attribute: the wrapped instruct tokenizer.
    self.instruct_tokenizer: InstructTokenizer[InstructRequest, FIMRequest, TokenizedType, AssistantMessageType] = (
        instruct_tokenizer
    )
    # Private collaborators.
    self._instruct_request_normalizer = request_normalizer
    self._chat_completion_request_validator = validator

`__reduce__()`

Provides a recipe for pickling (serializing) this object, which is necessary for use with multiprocessing.

Returns:

Type	Description
`Tuple[Callable, Tuple[Any, ...]]`	A tuple of the factory function and the arguments to reconstruct the object from its source file.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def __reduce__(self) -> Tuple[Callable, Tuple[Any, ...]]:
    """
    Tells pickle how to rebuild this object, which is necessary for use with multiprocessing.

    The tokenizer is re-created by calling `MistralTokenizer.from_file` with the path of the tokenizer file
    and the validation mode of the request validator.

    Returns:
        A tuple of the factory function and the arguments to reconstruct the object from its source file.
    """
    factory = MistralTokenizer.from_file
    tokenizer_file = self.instruct_tokenizer.tokenizer.file_path
    validation_mode = self._chat_completion_request_validator._mode
    return factory, (tokenizer_file, validation_mode)

`decode(tokens, special_token_policy=None)`

Decodes a list of tokens into a string.

Parameters:

Name	Type	Description	Default
`tokens`	`List[int]`	The tokens to decode.	required
`special_token_policy`	`Optional[SpecialTokenPolicy]`	The policy to use for special tokens. Passing `None` is deprecated and will be changed to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.	`None`

Returns:

Type	Description
`str`	The decoded string.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Turns a list of token ids back into text.

    The call is forwarded unchanged to the underlying instruct tokenizer.

    Args:
        tokens: The tokens to decode.
        special_token_policy: The policy to use for special tokens. Passing `None` is deprecated and will be changed
            to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

    Returns:
        The decoded string.
    """
    decode_tokens = self.instruct_tokenizer.decode
    return decode_tokens(tokens, special_token_policy=special_token_policy)

`encode_chat_completion(request, max_model_input_len=None)`

Encodes a chat completion request.

Parameters:

Name	Type	Description	Default
`request`	`ChatCompletionRequest[UATS]`	The chat completion request to encode.	required
`max_model_input_len`	`Optional[int]`	The maximum length of the input to the model. If `None`, the input will not be truncated.	`None`

Returns:

Type	Description
`TokenizedType`	The encoded chat completion request.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def encode_chat_completion(
    self, request: ChatCompletionRequest[UATS], max_model_input_len: Optional[int] = None
) -> TokenizedType:
    r"""Validates, normalizes and tokenizes a chat completion request.

    Args:
        request: The chat completion request to encode.
        max_model_input_len: The maximum length of the input to the model.
            If `None`, the input will not be truncated.

    Returns:
        The encoded chat completion request.

    Raises:
        TokenizerException: If the request asks for truncation but `max_model_input_len` is `None`.
    """
    checked_request = self._chat_completion_request_validator.validate_request(request)

    # `max_model_input_len` is only optional because many small scripts call this
    # function without any need for truncation and never provide the max model len.
    if max_model_input_len is None:
        if request.truncate_for_context_length:
            raise TokenizerException(
                "encoding a chat completion request with truncation, but no max model len was provided",
            )

    normalized_request = self._instruct_request_normalizer.from_chat_completion_request(checked_request)

    if request.truncate_for_context_length:
        normalized_request.truncate_at_max_tokens = max_model_input_len

    encoded = self.instruct_tokenizer.encode_instruct(normalized_request)
    return encoded

`encode_fim(request)`

Encodes a fill in the middle request.

Parameters:

Name	Type	Description	Default
`request`	`FIMRequest`	The fill in the middle request to encode.	required

Returns:

Type	Description
`TokenizedType`	The encoded fill in the middle request.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def encode_fim(self, request: FIMRequest) -> TokenizedType:
    r"""Tokenizes a fill in the middle (FIM) request.

    The request is forwarded unchanged to the underlying instruct tokenizer.

    Args:
        request: The fill in the middle request to encode.

    Returns:
        The encoded fill in the middle request.
    """
    fim_encoder = self.instruct_tokenizer.encode_fim
    return fim_encoder(request)

`encode_transcription(request)`

Encodes a transcription request.

Parameters:

Name	Type	Description	Default
`request`	`TranscriptionRequest`	The transcription request to encode.	required

Returns:

Type	Description
`TokenizedType`	The encoded transcription request.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def encode_transcription(self, request: TranscriptionRequest) -> TokenizedType:
    r"""Tokenizes a transcription request.

    The request is forwarded unchanged to the underlying instruct tokenizer.

    Args:
        request: The transcription request to encode.

    Returns:
        The encoded transcription request.
    """
    transcription_encoder = self.instruct_tokenizer.encode_transcription
    return transcription_encoder(request)

`from_file(tokenizer_filename, mode=ValidationMode.test)` `classmethod`

Loads a tokenizer from a file.

Parameters:

Name	Type	Description	Default
`tokenizer_filename`	`Union[str, Path]`	The path to the tokenizer file.	required
`mode`	`ValidationMode`	The validation mode to use.	`test`

Returns:

Type	Description
`MistralTokenizer`	The loaded tokenizer.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@classmethod
def from_file(
    cls,
    tokenizer_filename: Union[str, Path],
    mode: ValidationMode = ValidationMode.test,
) -> "MistralTokenizer":
    r"""Builds a `MistralTokenizer` from a tokenizer file.

    The file is loaded either as a tekken or as a sentencepiece tokenizer. Optional image and audio
    encoders are created from the tokenizer configuration, and the instruct tokenizer and request
    validator are chosen according to the tokenizer version.

    Args:
        tokenizer_filename: The path to the tokenizer file.
        mode: The validation mode to use.

    Returns:
        The loaded tokenizer.

    Raises:
        TokenizerException: If the file is neither a tekken nor a sentencepiece tokenizer, or if the
            tokenizer version matches none of the handled versions.
        AssertionError: If an image or audio encoder is configured for a tokenizer version that does
            not accept it, or if an audio config is present on a non-tekken tokenizer.
    """
    base_tokenizer: Union[SentencePieceTokenizer, Tekkenizer]

    if is_tekken(tokenizer_filename):
        base_tokenizer = Tekkenizer.from_file(tokenizer_filename)
        image_config = base_tokenizer.image
        audio_config = base_tokenizer.audio
    elif is_sentencepiece(tokenizer_filename):
        base_tokenizer = SentencePieceTokenizer(tokenizer_filename)
        image_config = get_image_config(tokenizer_filename)
        # spm can't have audio
        audio_config = None
    else:
        raise TokenizerException(f"Unrecognized tokenizer file: {tokenizer_filename}")

    if image_config is None:
        image_encoder = None
    else:
        image_encoder = load_image_encoder(image_config, base_tokenizer)

    if audio_config is None:
        audio_encoder = None
    else:
        assert isinstance(base_tokenizer, Tekkenizer), "Audio is only supported for tekken tokenizers"
        audio_encoder = load_audio_encoder(audio_config, base_tokenizer)

    version = base_tokenizer.version
    request_normalizer = normalizer_for_tokenizer_version(version)

    # Capability checks, in the same order as the per-version checks they replace:
    # images need at least v3, audio needs at least v7.
    if version in (TokenizerVersion.v1, TokenizerVersion.v2):
        assert image_encoder is None, "Tokenizer version needs to be >= v3"
    if version in (TokenizerVersion.v1, TokenizerVersion.v2, TokenizerVersion.v3):
        assert audio_encoder is None, "Tokenizer version needs to be >= v7"

    if version == TokenizerVersion.v1:
        return MistralTokenizer(
            InstructTokenizerV1(base_tokenizer),
            validator=MistralRequestValidator(mode=mode),
            request_normalizer=request_normalizer,
        )
    if version == TokenizerVersion.v2:
        return MistralTokenizer(
            InstructTokenizerV2(base_tokenizer),
            validator=MistralRequestValidator(mode=mode),
            request_normalizer=request_normalizer,
        )
    if version == TokenizerVersion.v3:
        return MistralTokenizer(
            InstructTokenizerV3(base_tokenizer, image_encoder=image_encoder),
            validator=MistralRequestValidatorV3(mode=mode),
            request_normalizer=request_normalizer,
        )
    if version == TokenizerVersion.v7:
        return MistralTokenizer(
            InstructTokenizerV7(base_tokenizer, image_encoder=image_encoder, audio_encoder=audio_encoder),
            validator=MistralRequestValidatorV5(mode=mode),
            request_normalizer=request_normalizer,
        )
    if version == TokenizerVersion.v11:
        return MistralTokenizer(
            InstructTokenizerV11(base_tokenizer, image_encoder=image_encoder, audio_encoder=audio_encoder),
            validator=MistralRequestValidatorV5(mode=mode),
            request_normalizer=request_normalizer,
        )
    if version == TokenizerVersion.v13:
        # NOTE(review): the audio encoder is neither passed on nor asserted to be absent for v13 -
        # confirm this is intended.
        return MistralTokenizer(
            InstructTokenizerV13(base_tokenizer, image_encoder=image_encoder),
            validator=MistralRequestValidatorV13(mode=mode),
            request_normalizer=request_normalizer,
        )

    # Reached only when the tokenizer version matched none of the branches above.
    raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

`from_hf_hub(repo_id, token=None, revision=None, force_download=False, local_files_only=False, mode=ValidationMode.test)` `staticmethod`

Download the Mistral tokenizer for a given Hugging Face repository ID.

See here for a list of our OSS models.

Parameters:

Name	Type	Description	Default
`repo_id`	`str`	The Hugging Face repo ID.	required
`token`	`Optional[Union[bool, str]]`	The Hugging Face token to use to download the tokenizer.	`None`
`revision`	`Optional[str]`	The revision of the model to use. If `None`, the latest revision will be used.	`None`
`mode`	`ValidationMode`	The validation mode to use.	`test`
`force_download`	`bool`	Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded even if it is already cached.	`False`
`local_files_only`	`bool`	Whether to only use local files. If `True`, nothing is downloaded and the tokenizer is loaded only if it is already cached.	`False`

Returns:

Type	Description
`MistralTokenizer`	The Mistral tokenizer for the given model.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@staticmethod
def from_hf_hub(
    repo_id: str,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    force_download: bool = False,
    local_files_only: bool = False,
    mode: ValidationMode = ValidationMode.test,
) -> "MistralTokenizer":
    r"""Fetch the tokenizer of a Hugging Face repository and load it as a `MistralTokenizer`.

    See [here](../../../../models.md#list-of-open-models) for a list of our OSS models.

    Args:
        repo_id: The Hugging Face repo ID.
        token: The Hugging Face token to use to download the tokenizer.
        revision: The revision of the model to use. If `None`, the latest revision will be used.
        force_download: Whether to force the download of the tokenizer. If `True`, the tokenizer will be downloaded
            even if it is already cached.
        local_files_only: Whether to only use local files, i.e. rely on an already cached tokenizer instead of
            downloading it.
        mode: The validation mode to use.

    Returns:
        The Mistral tokenizer for the given model.
    """
    # Resolve the repository to a tokenizer file first, then load that file.
    local_tokenizer_file = download_tokenizer_from_hf_hub(
        repo_id=repo_id,
        token=token,
        revision=revision,
        force_download=force_download,
        local_files_only=local_files_only,
    )
    return MistralTokenizer.from_file(local_tokenizer_file, mode=mode)

`from_model(model, strict=False)` `classmethod`

Get the Mistral tokenizer for a given model.

Parameters:

Name	Type	Description	Default
`model`	`str`	The model name.	required
`strict`	`bool`	Whether to use strict model name matching. If `False`, the model name is matched as a substring. This is deprecated and will be removed in `mistral_common=1.10.0`.	`False`

Returns:

Type	Description
`MistralTokenizer`	The Mistral tokenizer for the given model.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@classmethod
def from_model(cls, model: str, strict: bool = False) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer for a given model.

    With `strict=False` (deprecated) a `FutureWarning` is emitted and the first registered model name that
    is a substring of the lower-cased `model` is used. If there is no such match, or with `strict=True`,
    `model` must be an exact key of the model name mapping.

    Args:
        model: The model name.
        strict: Whether to use strict model name matching. If `False`, the model name is matched as a substring.
            This is deprecated and will be removed in `mistral_common=1.10.0`.

    Returns:
        The Mistral tokenizer for the given model.

    Raises:
        TokenizerException: If no tokenizer is registered for `model`.
    """
    if not strict:
        warnings.warn(
            "Calling `MistralTokenizer.from_model(..., strict=False)` is deprecated as it can lead to incorrect "
            "tokenizers. It is strongly recommended to use `MistralTokenizer.from_model(..., strict=True)` "
            "which will become the default in `mistral_common=1.10.0`. "
            "If you are using `mistral_common` for open-sourced model weights, we recommend using "
            "`MistralTokenizer.from_file('<path/to/tokenizer/file>')` instead.",
            FutureWarning,
            # Attribute the warning to the caller of `from_model` rather than to this module.
            stacklevel=2,
        )

        # TODO: Delete this code in mistral_common >= 1.10.0
        # Substring search of the model name mapping: the first registered name contained in the
        # lower-cased model name wins.
        lowered_model = model.lower()
        for model_name, tokenizer_cls in MODEL_NAME_TO_TOKENIZER_CLS.items():
            if model_name in lowered_model:
                return tokenizer_cls()

    # Exact lookup: the only path when `strict=True`, and the fallback when no substring matched.
    if model not in MODEL_NAME_TO_TOKENIZER_CLS:
        raise TokenizerException(f"Unrecognized model: {model}")

    return MODEL_NAME_TO_TOKENIZER_CLS[model]()

`v1()` `classmethod`

Get the Mistral tokenizer v1.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@classmethod
def v1(cls) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer v1, loaded from the `tokenizer.model.v1` file in the data path."""
    tokenizer_file = cls._data_path() / "tokenizer.model.v1"
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

`v2()` `classmethod`

Get the Mistral tokenizer v2.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@classmethod
def v2(cls) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer v2, loaded from its tokenizer file in the data path."""
    tokenizer_file = cls._data_path() / "mistral_instruct_tokenizer_240216.model.v2"
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

`v3(is_tekken=False, is_mm=False)` `classmethod`

Get the Mistral tokenizer v3.

Parameters:

Name	Type	Description	Default
`is_tekken`	`bool`	Whether the tokenizer is a tekken tokenizer. See Tekkenizer.	`False`
`is_mm`	`bool`	Whether to load image tokenizer.	`False`

Returns:

Type	Description
`MistralTokenizer`	The Mistral tokenizer v3.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@classmethod
def v3(cls, is_tekken: bool = False, is_mm: bool = False) -> "MistralTokenizer":
    r"""Get the Mistral tokenizer v3.

    Args:
        is_tekken: Whether the tokenizer is a tekken tokenizer. See
            [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer].
        is_mm: Whether to load image tokenizer.

    Returns:
        The Mistral tokenizer v3.

    Raises:
        ValueError: If `is_mm` is set without `is_tekken`.
    """
    if is_tekken:
        # Tekken ships one file with and one without multimodal support.
        tokenizer_name = "tekken_240911.json" if is_mm else "tekken_240718.json"
    elif is_mm:
        raise ValueError("Multimodal tokenizer is currently only supported for tekken")
    else:
        tokenizer_name = "mistral_instruct_tokenizer_240323.model.v3"

    tokenizer_file = cls._data_path() / tokenizer_name
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

`v7(is_mm=False)` `classmethod`

Get the Mistral tokenizer v7.

Parameters:

Name	Type	Description	Default
`is_mm`	`bool`	Whether to load the image tokenizer.	`False`

Returns:

Type	Description
`MistralTokenizer`	The Mistral tokenizer v7.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

@classmethod
def v7(cls, is_mm: bool = False) -> "MistralTokenizer":
    """Get the Mistral tokenizer v7.

    Args:
        is_mm: Whether to load the image tokenizer.

    Returns:
        The Mistral tokenizer v7.
    """
    # The multimodal variant lives in a separate file with the `v7m1` suffix.
    if is_mm:
        tokenizer_name = "mistral_instruct_tokenizer_241114.model.v7m1"
    else:
        tokenizer_name = "mistral_instruct_tokenizer_241114.model.v7"

    tokenizer_file = cls._data_path() / tokenizer_name
    return cls.from_file(str(tokenizer_file), mode=ValidationMode.test)

`load_audio_encoder(audio_config, tokenizer)`

Load an audio encoder from a config and a tokenizer.

Parameters:

Name	Type	Description	Default
`audio_config`	`AudioConfig`	The audio config.	required
`tokenizer`	`Tekkenizer`	The tokenizer.	required

Returns:

Type	Description
`AudioEncoder`	The audio encoder.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def load_audio_encoder(audio_config: AudioConfig, tokenizer: Tekkenizer) -> AudioEncoder:
    r"""Build an audio encoder from a config and a tokenizer.

    The ids of the `audio` and `begin_audio` special tokens are looked up in the tokenizer and
    handed to the encoder together with the config.

    Args:
        audio_config: The audio config.
        tokenizer: The tokenizer.

    Returns:
        The audio encoder.
    """
    audio_token_id = tokenizer.get_control_token(SpecialTokens.audio.value)
    begin_audio_token_id = tokenizer.get_control_token(SpecialTokens.begin_audio.value)
    audio_ids = SpecialAudioIDs(audio=audio_token_id, begin_audio=begin_audio_token_id)
    return AudioEncoder(audio_config, audio_ids)

`load_image_encoder(image_config, tokenizer)`

Load an image encoder from a config and a tokenizer.

Parameters:

Name	Type	Description	Default
`image_config`	`ImageConfig`	The image config.	required
`tokenizer`	`Union[Tekkenizer, SentencePieceTokenizer]`	The tokenizer.	required

Returns:

Type	Description
`ImageEncoder`	The image encoder.

Source code in src/mistral_common/tokens/tokenizers/mistral.py

def load_image_encoder(image_config: ImageConfig, tokenizer: Union[Tekkenizer, SentencePieceTokenizer]) -> ImageEncoder:
    r"""Build an image encoder from a config and a tokenizer.

    The ids of the `img`, `img_break` and `img_end` special tokens are looked up in the tokenizer
    and handed to the encoder together with the config.

    Args:
        image_config: The image config.
        tokenizer: The tokenizer.

    Returns:
        The image encoder.
    """
    img_token_id = tokenizer.get_control_token(SpecialTokens.img.value)
    img_break_token_id = tokenizer.get_control_token(SpecialTokens.img_break.value)
    img_end_token_id = tokenizer.get_control_token(SpecialTokens.img_end.value)
    image_ids = SpecialImageIDs(img=img_token_id, img_break=img_break_token_id, img_end=img_end_token_id)
    return ImageEncoder(image_config, image_ids)

mistral_common.tokens.tokenizers.mistral

MistralTokenizer(instruct_tokenizer, validator, request_normalizer)

__reduce__()

decode(tokens, special_token_policy=None)

encode_chat_completion(request, max_model_input_len=None)

encode_fim(request)

encode_transcription(request)

from_file(tokenizer_filename, mode=ValidationMode.test) classmethod

from_hf_hub(repo_id, token=None, revision=None, force_download=False, local_files_only=False, mode=ValidationMode.test) staticmethod

from_model(model, strict=False) classmethod

v1() classmethod

v2() classmethod

v3(is_tekken=False, is_mm=False) classmethod

v7(is_mm=False) classmethod

load_audio_encoder(audio_config, tokenizer)

load_image_encoder(image_config, tokenizer)

`mistral_common.tokens.tokenizers.mistral`

`MistralTokenizer(instruct_tokenizer, validator, request_normalizer)`

`__reduce__()`

`decode(tokens, special_token_policy=None)`

`encode_chat_completion(request, max_model_input_len=None)`

`encode_fim(request)`

`encode_transcription(request)`

`from_file(tokenizer_filename, mode=ValidationMode.test)` `classmethod`

`from_hf_hub(repo_id, token=None, revision=None, force_download=False, local_files_only=False, mode=ValidationMode.test)` `staticmethod`

`from_model(model, strict=False)` `classmethod`

`v1()` `classmethod`

`v2()` `classmethod`

`v3(is_tekken=False, is_mm=False)` `classmethod`

`v7(is_mm=False)` `classmethod`

`load_audio_encoder(audio_config, tokenizer)`

`load_image_encoder(image_config, tokenizer)`