mistral_common.tokens.tokenizers.base

InstructTokenizer(tokenizer, image_encoder, audio_encoder)

Bases: Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Base class for instruct tokenizers.

Attributes:

    tokenizer (Tokenizer): The tokenizer to use.
    image_encoder (Optional[ImageEncoder]): The image encoder to use, if any.

Parameters:

    tokenizer (Tokenizer): The tokenizer to use. Required.
    image_encoder (Optional[ImageEncoder]): The image encoder to use, if any. Required.
    audio_encoder (Optional[AudioEncoder]): The audio encoder to use, if any. Required.
Source code in src/mistral_common/tokens/tokenizers/base.py
def __init__(
    self, tokenizer: Tokenizer, image_encoder: Optional[ImageEncoder], audio_encoder: Optional[AudioEncoder]
) -> None:
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use if any.
    """

decode(tokens, special_token_policy=None) abstractmethod

Convert token ids to a string.

Parameters:

    tokens (List[int]): The token ids to decode. Required.
    special_token_policy (Optional[SpecialTokenPolicy]): The policy to use for special tokens. Passing None will default to self._special_token_policy for Tekkenizer and SpecialTokenPolicy.IGNORE for SentencePieceTokenizer. Note that passing None will be deprecated, and special_token_policy will default to SpecialTokenPolicy.IGNORE in mistral_common=1.10.0. Default: None.

Returns:

    str: The decoded string.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Convert token ids to string

    Args:
        tokens: The token ids to decode.
        special_token_policy: The policy to use for special tokens.
            Passing `None` will default to `self._special_token_policy` for
            [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
            for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
            Note that passing `None` will be deprecated and `special_token_policy` will default to
            `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

    Returns:
        The decoded string.
    """

encode_fim(request) abstractmethod

Convert a FIM request to a Tokenized object.

Parameters:

    request (FIMRequestType): The FIM request to encode. Required.

Returns:

    TokenizedType: The tokenized FIM request.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def encode_fim(self, request: FIMRequestType) -> TokenizedType:
    r"""FIM request to Tokenized object

    Args:
        request: The FIM request to encode.

    Returns:
        The tokenized FIM request.
    """

encode_instruct(request) abstractmethod

Convert an instruct request to a Tokenized object.

Parameters:

    request (InstructRequestType): The instruct request to encode. Required.

Returns:

    TokenizedType: The tokenized instruct request.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def encode_instruct(self, request: InstructRequestType) -> TokenizedType:
    r"""Instruct request to Tokenized object

    Args:
        request: The instruct request to encode.

    Returns:
        The tokenized instruct request.
    """

encode_transcription(request) abstractmethod

Encodes an audio transcription request into a tokenized format.

This method processes a transcription request containing audio data, encodes the user message, and returns the tokenized output.

Parameters:

    request (TranscriptionRequest): The transcription request object containing the audio data to be encoded. Required.

Returns:

    Tokenized: The tokenized representation of the audio data, including processed audio and tokens.
Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def encode_transcription(self, request: TranscriptionRequest) -> TokenizedType:
    r"""
    Encodes an audio transcription request into a tokenized format.

    This method processes a transcription request containing audio data,
    encodes the user message, and returns the tokenized output.

    Args:
        request: The transcription request object containing
            the audio data to be encoded.

    Returns:
        Tokenized: The tokenized representation of the audio data, including processed audio and tokens
    """
    ...
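A common pattern for audio tokenization is to replace the waveform with one placeholder token per audio frame. The sketch below illustrates only that idea; the placeholder id, the frame rate, and the function name are assumptions, and real encoders also add begin/end markers and carry the processed audio alongside the tokens.

```python
from typing import List

AUDIO_ID = 7        # hypothetical placeholder id for one frame of audio
FRAME_RATE = 12.5   # assumed number of placeholder tokens per second


def toy_encode_transcription(duration_s: float) -> List[int]:
    """Sketch: the audio is featurized and stood in for by one
    placeholder token per frame of the (assumed) frame rate."""
    n_frames = int(duration_s * FRAME_RATE)
    return [AUDIO_ID] * n_frames


print(len(toy_encode_transcription(2.0)))  # 25 placeholder tokens
```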

encode_user_content(content, is_last, system_prompt=None, force_img_first=False) abstractmethod

Encode a user content.

Parameters:

    content (Union[str, List[ContentChunk]]): The user content to encode. Required.
    is_last (bool): Whether the content is the last one. Required.
    system_prompt (Optional[str]): The system prompt. Default: None.
    force_img_first (bool): Whether to force the image to be first. Default: False.

Returns:

    Tuple[List[int], List[ndarray], List[Audio]]: The encoded tokens, images, and audio.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def encode_user_content(
    self,
    content: Union[str, List[ContentChunk]],
    is_last: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode a user content.

    Args:
        content: The user content to encode.
        is_last: Whether the content is the last one.
        system_prompt: The system prompt.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and images.
    """
    ...
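The shape of this method can be sketched without the real chunk types: text chunks become token ids, image chunks become a placeholder token plus the image data, and force_img_first reorders images ahead of text. Everything here (the placeholder id, the nested-list stand-in for ndarray, the toy text ids) is hypothetical.

```python
from typing import List, Tuple, Union

IMG_ID = 6                # hypothetical id for the image placeholder token
Image = List[List[int]]   # stand-in for np.ndarray pixel data


def toy_encode_user_content(
    content: Union[str, List[Union[str, Image]]],
    force_img_first: bool = False,
) -> Tuple[List[int], List[Image]]:
    """Sketch: text chunks -> token ids, image chunks -> placeholder token
    plus the image, optionally reordered image-first."""
    chunks = [content] if isinstance(content, str) else list(content)
    if force_img_first:
        # Stable sort: images (False -> 0) before text (True -> 1).
        chunks.sort(key=lambda c: isinstance(c, str))
    tokens: List[int] = []
    images: List[Image] = []
    for chunk in chunks:
        if isinstance(chunk, str):
            # Toy text ids: first character's code point per word.
            tokens.extend(ord(w[0]) for w in chunk.split())
        else:
            tokens.append(IMG_ID)
            images.append(chunk)
    return tokens, images
```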

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False) abstractmethod

Encode a user message.

Parameters:

    message (UserMessage): The user message to encode. Required.
    available_tools (Optional[List[Tool]]): The available tools. Required.
    is_last (bool): Whether the message is the last one. Required.
    is_first (bool): Whether the message is the first one. Required.
    system_prompt (Optional[str]): The system prompt. Default: None.
    force_img_first (bool): Whether to force the image to be first. Default: False.

Returns:

    Tuple[List[int], List[ndarray], List[Audio]]: The encoded tokens, images, and audio.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode a user message.

    Args:
        message: The user message to encode.
        available_tools: The available tools.
        is_last: Whether the message is the last one.
        is_first: Whether the message is the first one.
        system_prompt: The system prompt.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and images.
    """
    ...

SpecialTokenPolicy

Bases: int, Enum

What to do with special tokens when encoding/decoding.

Attributes:

    IGNORE: Ignore special tokens.
    KEEP: Keep special tokens.
    RAISE: Raise an error if special tokens are found.

SpecialTokens

Bases: str, Enum

[DEPRECATED] Enum of special tokens used in the tokenizer.

Attributes:

    unk: The unknown token.
    bos: The beginning-of-string token.
    eos: The end-of-string token.
    begin_inst: The beginning-of-instruction token.
    end_inst: The end-of-instruction token.
    begin_tools: The beginning-of-tools token.
    end_tools: The end-of-tools token.
    begin_tool_results: The beginning-of-tool-results token.
    end_tool_results: The end-of-tool-results token.
    tool_calls: The tool calls token.
    img: The image token.
    pad: The pad token.
    img_break: The image break token.
    img_end: The image end token.
    prefix: The prefix token for FIM.
    middle: The middle token for FIM.
    suffix: The suffix token for FIM.
    begin_system: The beginning-of-system-prompt token.
    end_system: The end-of-system-prompt token.
    begin_tool_content: The beginning-of-tool-content token.

Examples:

>>> unk = SpecialTokens.unk

Tokenized(**data)

Bases: MistralBase

A tokenized InstructRequest.

Attributes:

    tokens (List[int]): The token ids.
    text (Optional[str]): The text representation of the tokens.
    prefix_ids (Optional[List[int]]): The prefix ids for FIM.
    images (List[ndarray]): The loaded images associated with the tokens.

Examples:

>>> tokenized = Tokenized(tokens=[1, 2, 3], text="Hello world", prefix_ids=[1], images=[])
Source code in .venv/lib/python3.13/site-packages/pydantic/main.py
def __init__(self, /, **data: Any) -> None:
    """Create a new model by parsing and validating input data from keyword arguments.

    Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be
    validated to form a valid model.

    `self` is explicitly positional-only to allow `self` as a field name.
    """
    # `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks
    __tracebackhide__ = True
    validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
    if self is not validated_self:
        warnings.warn(
            'A custom validator is returning a value other than `self`.\n'
            "Returning anything other than `self` from a top level model validator isn't supported when validating via `__init__`.\n"
            'See the `model_validator` docs (https://docs.pydantic.dev/latest/concepts/validators/#model-validators) for more details.',
            stacklevel=2,
        )
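Tokenized is a pydantic model, so construction validates the keyword arguments. As a dependency-free sketch of the same shape, here is a dataclass stand-in using the field names from the attribute list above; the single validation rule is invented for illustration and is far weaker than pydantic's.

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class ToyTokenized:
    """Dataclass stand-in for the pydantic Tokenized model."""
    tokens: List[int]
    text: Optional[str] = None
    prefix_ids: Optional[List[int]] = None
    images: List[list] = field(default_factory=list)

    def __post_init__(self) -> None:
        # Crude analogue of pydantic validation: token ids must be ints.
        if not all(isinstance(t, int) for t in self.tokens):
            raise TypeError("tokens must be a list of ints")


tokenized = ToyTokenized(tokens=[1, 2, 3], text="Hello world", prefix_ids=[1])
```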

Tokenizer

Bases: ABC

bos_id abstractmethod property

The id of the beginning-of-string (BOS) token.

eos_id abstractmethod property

The id of the end-of-string (EOS) token.

file_path abstractmethod property

The file path of the tokenizer.

n_words abstractmethod property

The vocabulary size of the tokenizer.

pad_id abstractmethod property

The id of the padding (PAD) token.

unk_id abstractmethod property

The id of the unknown (UNK) token.

version abstractmethod property

The version of the tokenizer.

decode(tokens, special_token_policy=None) abstractmethod

Decode the token ids to a string.

Parameters:

    tokens (List[int]): The token ids to decode. Required.
    special_token_policy (Optional[SpecialTokenPolicy]): The policy to use for special tokens. Passing None will default to self._special_token_policy for Tekkenizer and SpecialTokenPolicy.IGNORE for SentencePieceTokenizer. Note that passing None will be deprecated, and special_token_policy will default to SpecialTokenPolicy.IGNORE in mistral_common=1.10.0. Default: None.

Returns:

    str: The decoded string.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Decode the token ids to a string.

    Args:
        tokens: The token ids to decode.
        special_token_policy: The policy to use for special tokens.
            Passing `None` will default to `self._special_token_policy` for
            [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and `SpecialTokenPolicy.IGNORE`
            for [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
            Note that passing `None` will be deprecated and `special_token_policy` will default to
            `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

    Returns:
        The decoded string.
    """

encode(s, bos, eos) abstractmethod

Convert a string to a list of token ids.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
    """Convert a string to a list of token ids."""

get_control_token(s) abstractmethod

Get the id of a control token.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def get_control_token(self, s: str) -> int:
    r"""Get the id of a control token."""

id_to_piece(token_id) abstractmethod

Convert a token id to the token str.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def id_to_piece(self, token_id: int) -> str:
    r"""Convert a token id to the token str."""

to_string(tokens) abstractmethod

[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

Use decode with special_token_policy=SpecialTokenPolicy.KEEP instead.

This is a convenience method for debugging.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def to_string(self, tokens: List[int]) -> str:
    r"""[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

    Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

    This is a convenient method for debugging.
    """
    ...

vocab() abstractmethod

All tokens in the vocabulary as strings.

Source code in src/mistral_common/tokens/tokenizers/base.py
@abstractmethod
def vocab(self) -> List[str]:
    r"""All tokens in the vocabulary as strings."""

TokenizerVersion

Bases: str, Enum

Enum of tokenizer versions.

Allows distinguishing between different versions of the tokenizer and maintaining backward compatibility.

Attributes:

    v1: The first version of the tokenizer.
    v2: The second version of the tokenizer, which includes the special control tokens [INST] and [/INST].
    v3: The third version of the tokenizer, which includes improved function calling.
    v7: The seventh version of the tokenizer, which includes improved system prompts and function calling.
    v11: The eleventh version of the tokenizer, which includes improved function calling.
    v13: The thirteenth version of the tokenizer, which drops call id tokenization and improves prompt caching.

Examples:

>>> version = TokenizerVersion.v1
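Because TokenizerVersion mixes in str, members compare directly against raw strings (for example, values read from a config file), and unknown versions fail loudly at construction. A trimmed toy replica with only the first three members:

```python
from enum import Enum


class TokenizerVersion(str, Enum):
    """Toy replica (first three members) of the (str, Enum) pattern."""
    v1 = "v1"
    v2 = "v2"
    v3 = "v3"


# A raw string can be parsed into a member and compared as a plain str.
version = TokenizerVersion("v3")
print(version == "v3")  # True
```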

UserMessagePosition

Bases: str, Enum

Where to encode available tools.