Skip to content

mistral_common.tokens.tokenizers.instruct

InstructTokenizerBase(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizer, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Base instruct tokenizer.

Parameters:

Name Type Description Default
tokenizer Tokenizer

The tokenizer to use.

required
image_encoder Optional[ImageEncoder]

The image encoder to use if any.

None
audio_encoder Optional[AudioEncoder]

The audio encoder to use.

None
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
):
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use.
    """
    # NOTE(review): the three attributes are assigned here and the same values are then
    # passed to `super().__init__`, which presumably assigns them again — confirm which
    # side is redundant before simplifying.
    self.tokenizer = tokenizer
    self.image_encoder = image_encoder
    self.audio_encoder = audio_encoder
    super().__init__(tokenizer, image_encoder, audio_encoder)

decode(tokens, special_token_policy=None)

Decode tokens to a string.

Parameters:

Name Type Description Default
tokens List[int]

The tokens to decode.

required
special_token_policy Optional[SpecialTokenPolicy]

The policy to use for special tokens. Passing None will default to self._special_token_policy for Tekkenizer and SpecialTokenPolicy.IGNORE for SentencePieceTokenizer. Note that passing None will be deprecated and special_token_policy will default to SpecialTokenPolicy.IGNORE in mistral_common=1.10.0.

None

Returns:

Type Description
str

The decoded string.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Convert a sequence of token ids back into a string.

    Args:
        tokens: The token ids to decode.
        special_token_policy: How special tokens are handled while decoding.
            When `None`, behavior depends on the underlying tokenizer:
            `self._special_token_policy` for
            [Tekkenizer][mistral_common.tokens.tokenizers.tekken.Tekkenizer] and
            `SpecialTokenPolicy.IGNORE` for
            [SentencePieceTokenizer][mistral_common.tokens.tokenizers.sentencepiece.SentencePieceTokenizer].
            Passing `None` is deprecated; from `mistral_common=1.10.0` the default
            becomes `SpecialTokenPolicy.IGNORE`.

    Returns:
        The decoded string.
    """
    # Pure delegation: the wrapped tokenizer owns the decoding logic.
    return self.tokenizer.decode(tokens, special_token_policy=special_token_policy)

encode_assistant_message(message, is_before_last_user_message, continue_message) abstractmethod

Encode an assistant message.

Raises:

Type Description
NotImplementedError

The assistant message is not implemented for the base tokenizer.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
@abstractmethod
def encode_assistant_message(
    self, message: AssistantMessageType, is_before_last_user_message: bool, continue_message: bool
) -> List[int]:
    r"""Abstract hook turning an assistant message into token ids.

    Concrete subclasses must override this; the base tokenizer defines no
    encoding scheme for assistant messages.

    Raises:
        NotImplementedError: Always, on the base tokenizer.
    """
    raise NotImplementedError("Assistant message not implemented")

encode_instruct(request)

Encode an instruct request.

Parameters:

Name Type Description Default
request InstructRequest[AssistantMessageType, Tool]

The request to encode.

required

Returns:

Type Description
Tokenized

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_instruct(
    self,
    request: InstructRequest[AssistantMessageType, Tool],
) -> Tokenized:
    r"""Encode an instruct request into tokens, images and audios.

    Each message is encoded by its message-type-specific `encode_*` hook, then the
    per-message token lists are concatenated after the tokens from `self.start()`.

    Args:
        request: The request to encode.

    Returns:
        The encoded tokens, along with the decoded text, the tokens of a trailing
        assistant message (as `prefix_ids`, if any) and the images/audios collected
        from user messages.

    Raises:
        InvalidMessageStructureException: If `continue_final_message` is set but the
            final message is not an assistant message.
        TokenizerException: If a message has an unknown type.
    """
    # init at bos
    images: List[np.ndarray] = []
    audios: List[Audio] = []
    prefix_ids: Optional[List[int]] = None  # tokens of the final assistant message, if any
    tokens_list: List[Optional[List[int]]] = []  # one entry per message, in order

    # validate messages
    self.validate_messages(request.messages)

    # find last user message
    first_user_idx, last_user_idx = self.find_first_last_user(request)
    for msg_idx, msg in enumerate(request.messages):
        # Continuation only makes sense when resuming an assistant turn, so the
        # final message must be an assistant message.
        if (
            request.continue_final_message
            and (msg_idx == len(request.messages) - 1)
            and not isinstance(msg, AssistantMessage)
        ):
            raise InvalidMessageStructureException(
                "Cannot continue final message if it is not an assistant message"
            )
        if isinstance(msg, UserMessage):
            new_tokens, new_images, new_audios = self.encode_user_message(
                msg,
                request.available_tools,
                msg_idx == last_user_idx,
                msg_idx == first_user_idx,
                system_prompt=request.system_prompt,
                force_img_first=True,  # img is always first when providing text/img chunk pair
            )
            images.extend(new_images)
            audios.extend(new_audios)
        elif isinstance(msg, ToolMessage):
            new_tokens = self.encode_tool_message(msg, msg_idx < last_user_idx)
        elif isinstance(msg, AssistantMessage):
            continue_message = request.continue_final_message and (msg_idx == len(request.messages) - 1)

            new_tokens = self.encode_assistant_message(
                msg, msg_idx < last_user_idx, continue_message=continue_message
            )
            if msg_idx == len(request.messages) - 1:
                # Expose the final assistant message's tokens separately as prefix_ids.
                prefix_ids = new_tokens
        elif isinstance(msg, SystemMessage):
            new_tokens = self.encode_system_message(msg)
        else:
            raise TokenizerException(f"Unknown message type {type(msg)}")

        tokens_list.append(new_tokens)

    if request.truncate_at_max_tokens is not None:
        # NOTE(review): presumably truncates entries of `tokens_list` in place to fit
        # the token budget — confirm against `_truncate_for_max_tokens`.
        self._truncate_for_max_tokens(
            tokens_list,
            request.messages,
            request.truncate_at_max_tokens,
            last_user_idx,
        )
    tokens = self.start()

    for tok in tokens_list:
        if tok is not None:
            tokens.extend(tok)

    return Tokenized(
        tokens=tokens,
        text=self.decode(tokens, special_token_policy=SpecialTokenPolicy.KEEP),
        prefix_ids=prefix_ids,
        images=images,
        audios=audios,
    )

encode_tool_message(message, is_before_last_user_message) abstractmethod

Encode a tool message.

Raises:

Type Description
NotImplementedError

The tool message is not implemented for the base tokenizer.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
@abstractmethod
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Abstract hook turning a tool message into token ids.

    Concrete subclasses must override this; the base tokenizer defines no
    encoding scheme for tool messages.

    Raises:
        NotImplementedError: Always, on the base tokenizer.
    """
    raise NotImplementedError("Tool message not implemented")

find_first_last_user(request) staticmethod

Find the first and last user message in the request.

Parameters:

Name Type Description Default
request InstructRequest

The request to search for user messages.

required

Returns:

Type Description
Tuple[int, int]

The index of the first and last user message.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
@staticmethod
def find_first_last_user(request: InstructRequest) -> Tuple[int, int]:
    r"""Find the first and last user message in the request.

    Args:
        request: The request to search for user messages.

    Returns:
        A `(first_user_idx, last_user_idx)` tuple of message indices; both are
        -1 when the request contains no user message.
    """
    last_user_idx = -1
    first_user_idx = -1
    # Single forward pass; no need to materialize enumerate() into a list (PERF101).
    for i, msg in enumerate(request.messages):
        if isinstance(msg, UserMessage):
            if first_user_idx == -1:
                first_user_idx = i
            last_user_idx = i
    return first_user_idx, last_user_idx

start()

Return the start tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def start(self) -> List[int]:
    r"""Return the tokens every encoded request begins with (the BOS token)."""
    bos = self.tokenizer.bos_id
    return [bos]

InstructTokenizerV1(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizerBase, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Instruct tokenizer V1.

This tokenizer has basic support for messages. It does not support tools or image inputs.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
):
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use.
    """
    # NOTE(review): identical to the base-class __init__ — attributes are assigned
    # here and the same values passed to `super().__init__`; confirm whether this
    # override is needed at all.
    self.tokenizer = tokenizer
    self.image_encoder = image_encoder
    self.audio_encoder = audio_encoder
    super().__init__(tokenizer, image_encoder, audio_encoder)

encode_assistant_message(message, is_before_last_user_message, continue_message)

Encode an assistant message.

Parameters:

Name Type Description Default
message AssistantMessageType

The message to encode.

required
is_before_last_user_message bool

Not used.

required
continue_message bool

Whether to continue the message generation. Only use this if the assistant message is the last message.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(
    self, message: AssistantMessageType, is_before_last_user_message: bool, continue_message: bool
) -> List[int]:
    r"""Encode an assistant message.

    Args:
        message: The message to encode. Tool calls are rejected in V1.
        is_before_last_user_message: Not used.
        continue_message: Whether to continue the message generation.
            Only use this if the assistant message is the last message.

    Returns:
        The encoded tokens.

    Raises:
        TokenizerException: If the message carries tool calls or has no content.
        InvalidAssistantMessageException: If `continue_message` is combined with `prefix=True`.
    """
    assert isinstance(message, AssistantMessage), message
    # V1 has no tool-call encoding scheme at all.
    if message.tool_calls is not None and len(message.tool_calls) > 0:
        raise TokenizerException("Tools not implemented for tokenizer V1")
    if continue_message and message.prefix:
        raise InvalidAssistantMessageException(
            "`continue_message` is only supported for assistant messages that have `prefix=False`."
        )
    if not message.content:
        raise TokenizerException(f"{message.content} // {message.tool_calls}")
    encoded = self.tokenizer.encode(message.content, bos=False, eos=False)
    # A completed assistant turn is closed with EOS; prefix/continued messages stay
    # open so generation can resume from them.
    if not message.prefix and not continue_message:
        encoded.append(self.tokenizer.eos_id)
    return encoded

encode_fim(request)

Encode a FIM request.

Raises:

Type Description
TokenizerException

The FIM request is not implemented for this version.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_fim(self, request: FIMRequest) -> Tokenized:
    r"""Encode a FIM (fill-in-the-middle) request.

    FIM is not supported by this tokenizer version.

    Raises:
        TokenizerException: Always, since FIM is unavailable for this version.
    """
    version = self.tokenizer.version
    raise TokenizerException(f"FIM not available for {version}")

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Raises:

Type Description
TokenizerException

The tool message is not implemented for this version.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Tool messages are not supported by tokenizer V1.

    Raises:
        TokenizerException: Always, since tools are unavailable for this version.
    """
    raise TokenizerException("Tools not implemented for tokenizer V1")

encode_user_content(content, is_last, system_prompt=None, force_img_first=False)

Encode a user content.

Parameters:

Name Type Description Default
content Union[str, List[ContentChunk]]

The content to encode.

required
is_last bool

Whether the message is the last one.

required
system_prompt Optional[str]

The system prompt.

None
force_img_first bool

Not used.

False

Returns:

Type Description
Tuple[List[int], List[ndarray], List[Audio]]

The encoded tokens and empty list.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_content(
    self,
    content: Union[str, List[ContentChunk]],
    is_last: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode user content.

    V1 only accepts plain-string content; the system prompt, when given, is
    prepended to the last message's content.

    Args:
        content: The content to encode; must be a plain string for V1.
        is_last: Whether the message is the last one.
        system_prompt: The system prompt to prepend when `is_last` is set.
        force_img_first: Not used.

    Returns:
        The encoded tokens plus empty image and audio lists.
    """
    assert isinstance(content, str)

    text = f"{system_prompt}\n\n{content}" if (is_last and system_prompt) else content
    return self.tokenizer.encode(text, bos=False, eos=False), [], []

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False)

Encode a user message.

Parameters:

Name Type Description Default
message UserMessage

The message to encode.

required
available_tools Optional[List[Tool]]

Not used.

required
is_last bool

Not used.

required
is_first bool

Whether the message is the first one.

required
system_prompt Optional[str]

The system prompt.

None
force_img_first bool

Not used.

False

Returns:

Type Description
Tuple[List[int], List[ndarray], List[Audio]]

The encoded tokens and empty list.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode a user message.

    Args:
        message: The message to encode; its content must already be a plain string.
        available_tools: Not used.
        is_last: Not used.
        is_first: Whether the message is the first one.
        system_prompt: The system prompt, prepended to the first message only.
        force_img_first: Not used.

    Returns:
        The encoded tokens plus empty image and audio lists.
    """
    assert isinstance(message.content, str), "Message content must be normalized"
    assert self.image_encoder is None, "InstructTokenizerV1 cannot encode images"

    # Prepend the system prompt to the very first message only.
    if is_first and system_prompt:
        body = system_prompt + "\n\n" + message.content
    else:
        body = message.content

    # The [INST] wrapping is done here, so the content encoder receives the final text
    # and must not re-apply the system prompt.
    return self.encode_user_content(content=f"[INST] {body} [/INST]", is_last=False, system_prompt=None)

InstructTokenizerV11(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizerV7

Instruct tokenizer V11.

The difference with the V7 tokenizer is that it encodes tool calls differently. Tool calls are encoded as: [begin tool call] call_name_tokens [call id] call_id_tokens [args] content tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
) -> None:
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use.
    """
    super().__init__(tokenizer, image_encoder, audio_encoder)
    # Control tokens for the V11 tool-call encoding scheme (args and call id markers).
    self.ARGS = self.tokenizer.get_control_token(SpecialTokens.args.value)
    self.CALL_ID = self.tokenizer.get_control_token(SpecialTokens.call_id.value)

InstructTokenizerV13(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizerV11

Instruct tokenizer V13.

The difference with V11 tokenizer is that it encodes tool calls differently
  • available tools are tokenized at the first user message.
  • call id is no longer tokenized for tool calls or results.
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
) -> None:
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use if any.
        audio_encoder: The audio encoder to use.
    """
    super().__init__(tokenizer, image_encoder, audio_encoder)
    # NOTE(review): these two assignments duplicate the V11 parent __init__ called just
    # above, and CALL_ID is still fetched although V13 reportedly no longer tokenizes
    # call ids — confirm whether either line is needed.
    self.ARGS = self.tokenizer.get_control_token(SpecialTokens.args.value)
    self.CALL_ID = self.tokenizer.get_control_token(SpecialTokens.call_id.value)

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Parameters:

Name Type Description Default
message ToolMessage

The message to encode.

required
is_before_last_user_message bool

Not used.

required

Returns: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    The raw message content is tokenized and wrapped between the begin/end
    tool-result control tokens; the call id itself is not tokenized in V13.

    Args:
        message: The message to encode; `tool_call_id` must be set.
        is_before_last_user_message: Not used.

    Returns:
        The encoded tokens.
    """
    assert message.tool_call_id is not None, "Tool call id must be provided for tokenizer >= v13"

    content_tokens = self.tokenizer.encode(message.content, bos=False, eos=False)
    return [self.BEGIN_TOOL_RESULTS, *content_tokens, self.END_TOOL_RESULTS]

InstructTokenizerV2(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizerV1, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Instruct tokenizer V2.

This tokenizer adds support for images, tools and FIM requests.

Parameters:

Name Type Description Default
tokenizer Tokenizer

The tokenizer to use.

required
image_encoder Optional[ImageEncoder]

The image encoder to use.

None
audio_encoder Optional[AudioEncoder]

The audio encoder to use.

None
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
):
    r"""Initialize the tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use.
        audio_encoder: The audio encoder to use.
    """
    super().__init__(tokenizer, image_encoder, audio_encoder)
    # Cache the control-token ids used by the V2 encoding scheme.
    control = self.tokenizer.get_control_token
    self.BEGIN_INST = control(SpecialTokens.begin_inst.value)
    self.END_INST = control(SpecialTokens.end_inst.value)
    self.BEGIN_AVAILABLE_TOOLS = control(SpecialTokens.begin_tools.value)
    self.END_AVAILABLE_TOOLS = control(SpecialTokens.end_tools.value)
    self.BEGIN_TOOL_RESULTS = control(SpecialTokens.begin_tool_results.value)
    self.END_TOOL_RESULTS = control(SpecialTokens.end_tool_results.value)
    self.TOOL_CALLS = control(SpecialTokens.tool_calls.value)
    self.BOS = control(SpecialTokens.bos.value)
    self.PREFIX = control(SpecialTokens.prefix.value)
    self.SUFFIX = control(SpecialTokens.suffix.value)

encode_assistant_message(message, is_before_last_user_message, continue_message)

Encode an assistant message.

Parameters:

Name Type Description Default
message AssistantMessageType

The message to encode.

required
is_before_last_user_message bool

Whether the message is before the last user message. If has tools and true, the message is not encoded.

required
continue_message bool

Whether to continue the message generation. Only use this if the assistant message is the last message.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(
    self, message: AssistantMessageType, is_before_last_user_message: bool, continue_message: bool
) -> List[int]:
    r"""Encode an assistant message.

    Args:
        message: The message to encode; must have either content or tool calls, not both.
        is_before_last_user_message: Whether the message is before the last user message. If it has
            tool calls and this is true, the message is not encoded.
        continue_message: Whether to continue the message generation.
            Only use this if the assistant message is the last message.

    Returns:
        The encoded tokens.

    Raises:
        ValueError: If the message defines both tool calls and content.
        InvalidAssistantMessageException: If `continue_message` is set with `prefix=True`.
        TokenizerException: If the message has neither content nor tool calls.
    """
    if message.tool_calls and message.content:
        raise ValueError(f"Cannot have tool calls and content defined in the same assistant message {message}")
    if continue_message and message.prefix:
        raise InvalidAssistantMessageException(
            "`continue_message` is only supported for assistant messages that have `prefix=False`."
        )

    if message.tool_calls:
        if is_before_last_user_message:
            # don't tokenize tool call before last user message
            return []
        curr_tokens = self._encode_tool_calls_in_assistant_message(message)
    elif message.content:
        curr_tokens = self._encode_normal_content_assistant_message(message)
    else:
        raise TokenizerException(f"Invalid assistant message: {message.content}")
    # EOS closes a completed assistant turn; prefix/continued messages are left open
    # so generation can resume from them.
    if not message.prefix and not continue_message:
        curr_tokens.append(self.tokenizer.eos_id)
    return curr_tokens

encode_fim(request)

Encode a FIM request.

Parameters:

Name Type Description Default
request FIMRequest

The request to encode.

required

Returns:

Type Description
Tokenized

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_fim(self, request: FIMRequest) -> Tokenized:
    r"""Encode a FIM (fill-in-the-middle) request.

    The output layout is `[BOS][SUFFIX]<suffix tokens>[PREFIX]<prompt tokens>`,
    i.e. suffix-first infilling.

    Args:
        request: The request to encode.

    Returns:
        The encoded tokens.
    """
    encoded_prompt = self.tokenizer.encode(request.prompt, bos=False, eos=False)
    encoded_suffix: List[int] = []
    if request.suffix:
        encoded_suffix = self._encode_infilling(request.suffix)
    tokens = [self.BOS, self.SUFFIX, *encoded_suffix, self.PREFIX, *encoded_prompt]
    return Tokenized(tokens=tokens, text=self.decode(tokens, special_token_policy=SpecialTokenPolicy.KEEP))

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Parameters:

Name Type Description Default
message ToolMessage

The message to encode.

required
is_before_last_user_message bool

Whether the message is before the last user message. If true, the message is not encoded.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    The prepared tool result is wrapped in a single-element JSON list, tokenized,
    and delimited by the begin/end tool-result control tokens.

    Args:
        message: The message to encode.
        is_before_last_user_message: Whether the message is before the last user
            message. If true, the message is not encoded.

    Returns:
        The encoded tokens.
    """
    # Tool responses that precede the last user message are skipped entirely.
    if is_before_last_user_message:
        return []

    # Currently only supports single tool results, hence the one-element list.
    payload = json.dumps([self._prepare_tool_result(message)], ensure_ascii=False)
    payload_tokens = self.tokenizer.encode(payload, bos=False, eos=False)
    return [self.BEGIN_TOOL_RESULTS, *payload_tokens, self.END_TOOL_RESULTS]

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False)

Encode a user message.

Parameters:

Name Type Description Default
message UserMessage

The message to encode.

required
available_tools Optional[List[Tool]]

The list of available tools if any.

required
is_last bool

Whether the message is the last one.

required
is_first bool

Not used.

required
system_prompt Optional[str]

The system prompt.

None
force_img_first bool

Whether to force the image to be first.

False

Returns:

Type Description
Tuple[List[int], List[ndarray], List[Audio]]

The encoded tokens and the list of images.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode a user message.

    Args:
        message: The message to encode.
        available_tools: The list of available tools if any.
        is_last: Whether the message is the last one.
        is_first: Whether the message is the first one.
        system_prompt: The system prompt.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and the lists of images and audios.
    """
    # Tools are serialized into the first or last user message depending on the
    # tokenizer's configured position.
    position = self._user_message_position_to_encode_tools
    should_encode_tools = (is_first and position == UserMessagePosition.first) or (
        is_last and position == UserMessagePosition.last
    )

    tools_tokens: List[int] = []
    if should_encode_tools and available_tools:
        tools_json = json.dumps([tool.model_dump() for tool in available_tools], ensure_ascii=False)
        tools_tokens = [
            self.BEGIN_AVAILABLE_TOOLS,
            *self.tokenizer.encode(tools_json, bos=False, eos=False),
            self.END_AVAILABLE_TOOLS,
        ]

    content_tokens, images, audios = self.encode_user_content(
        content=message.content,
        is_last=is_last,
        system_prompt=system_prompt,
        force_img_first=force_img_first,
    )

    # Layout: [available tools][BEGIN_INST]<content>[END_INST]
    return [*tools_tokens, self.BEGIN_INST, *content_tokens, self.END_INST], images, audios

InstructTokenizerV3(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizerV2, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Instruct tokenizer V3.

The only difference with V2 tokenizer is that it encodes the tool messages differently.

Parameters:

Name Type Description Default
tokenizer Tokenizer

The tokenizer to use.

required
image_encoder Optional[ImageEncoder]

The image encoder to use.

None
audio_encoder Optional[AudioEncoder]

The audio encoder to use.

None
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
):
    r"""Initialize the tokenizer.

    V3 reuses the V2 initialization unchanged; only the tool-message
    encoding scheme differs.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use.
        audio_encoder: The audio encoder to use.
    """
    super().__init__(tokenizer, image_encoder, audio_encoder)

encode_assistant_message(message, is_before_last_user_message, continue_message)

Encode an assistant message.

Note

Same as V2 but always encodes the tool history. The continue_message argument indicates whether to continue the message generation; only use it if the assistant message is the last message.

Parameters:

Name Type Description Default
message AssistantMessageType

The message to encode.

required
is_before_last_user_message bool

Not used.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(
    self, message: AssistantMessageType, is_before_last_user_message: bool, continue_message: bool
) -> List[int]:
    r"""Encode an assistant message.

    Note:
        Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_assistant_message] but
        always encode the tool history.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.
        continue_message: Whether to continue the message generation.
            Only use this if the assistant message is the last message.

    Returns:
        The encoded tokens.
    """
    # Pass `False` so the V2 implementation never skips tool-call history.
    return super().encode_assistant_message(message, False, continue_message)

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Note

Same as V2 but tools are not wrapped in a list and the history is also tokenized.

Parameters:

Name Type Description Default
message ToolMessage

The message to encode.

required
is_before_last_user_message bool

Whether the message is before the last user message. If true, the message is not encoded.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Note:
        Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_tool_message] but tools
        are not wrapped in a list and the history is also tokenized.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used; unlike V2, the tool history is
            always encoded in V3.

    Returns:
        The encoded tokens.
    """
    payload = json.dumps(self._prepare_tool_result(message), ensure_ascii=False)
    payload_tokens = self.tokenizer.encode(payload, bos=False, eos=False)
    return [self.BEGIN_TOOL_RESULTS, *payload_tokens, self.END_TOOL_RESULTS]

encode_user_content(content, is_last, system_prompt=None, force_img_first=False)

Encode a user content.

Parameters:

Name Type Description Default
content Union[str, List[ContentChunk]]

The content to encode.

required
is_last bool

Whether the message is the last one.

required
system_prompt Optional[str]

The system prompt.

None
force_img_first bool

Whether to force the image to be first.

False

Returns:

Type Description
Tuple[List[int], List[ndarray], List[Audio]]

The encoded tokens and the images.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_content(
    self,
    content: Union[str, List[ContentChunk]],
    is_last: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode user content.

    Plain-string content is delegated to the V2 implementation. Chunked content is
    encoded chunk by chunk, interleaving text tokens with image/audio tokens.

    Args:
        content: The content to encode.
        is_last: Whether the message is the last one.
        system_prompt: The system prompt; prepended to the first chunk of the last
            message.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens, the processed images and the processed audios.
    """
    if isinstance(content, str):
        return super().encode_user_content(content, is_last, system_prompt)

    tokens: List[int] = []
    images: List[np.ndarray] = []
    audio: List[Audio] = []

    # NOTE(review): only content[1] is type-checked here; presumably content[0] is the
    # text half of a (text, image) pair — confirm against callers.
    has_one_img_one_text_first = len(content) == 2 and isinstance(content[1], (ImageChunk, ImageURLChunk))
    if force_img_first and has_one_img_one_text_first:
        # make sure that if exactly one image and text chunk are passed we force the image chunk to be first
        content = [content[1], content[0]]

    first_chunk = True
    for chunk in content:
        content_str = ""
        # Prepend the system prompt once, on the first chunk of the last message.
        if first_chunk and is_last and system_prompt:
            first_chunk = False
            content_str = system_prompt + "\n\n"

        if isinstance(chunk, TextChunk):
            content_str += chunk.text
            tokens.extend(self.tokenizer.encode(content_str, bos=False, eos=False))
        elif isinstance(chunk, (ImageChunk, ImageURLChunk)):
            assert self.image_encoder is not None, "Make sure to define a image encoder at init"
            if content_str:
                # A pending system prompt is tokenized before the image tokens.
                tokens.extend(self.tokenizer.encode(content_str, bos=False, eos=False))

            img_encoding = self.image_encoder(chunk)

            tokens.extend(img_encoding.tokens)
            images.append(img_encoding.image)
        elif isinstance(chunk, (AudioChunk, AudioURLChunk)):
            # NOTE(review): a system prompt on a leading audio chunk trips this assert,
            # i.e. audio-first content plus a system prompt is unsupported — confirm intended.
            assert not content_str, (
                f"It is not possible that `content` is non-empty when chunk is of type {type(chunk)}."
            )
            # the following is only possible for >= v7
            assert self.audio_encoder is not None, "Make sure to define a audio encoder at init"
            audio_encoding = self.audio_encoder(chunk)

            tokens.extend(audio_encoding.tokens)
            audio.append(audio_encoding.audio)
        else:
            raise ValueError(f"Unknown chunk type: {chunk}")

    return tokens, images, audio

InstructTokenizerV7(tokenizer, image_encoder=None, audio_encoder=None)

Bases: InstructTokenizerV3

Instruct tokenizer V7.

The difference with V3 tokenizer is that it encodes the system prompts differently: - in V7 the system prompts are treated as separate SystemMessages - they are no longer prepended to the last user message - they are printed between special tokens

Parameters:

Name Type Description Default
tokenizer Tokenizer

The tokenizer to use.

required
image_encoder Optional[ImageEncoder]

The image encoder to use.

None
audio_encoder Optional[AudioEncoder]

The audio encoder to use.

None
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(
    self,
    tokenizer: Tokenizer,
    image_encoder: Optional[ImageEncoder] = None,
    audio_encoder: Optional[AudioEncoder] = None,
) -> None:
    r"""Initialize the V7 instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        image_encoder: The image encoder to use.
        audio_encoder: The audio encoder to use.
    """
    super().__init__(tokenizer, image_encoder, audio_encoder)

    # Resolve the V7-specific control tokens once at construction time.
    control_token = self.tokenizer.get_control_token
    self.BEGIN_SYSTEM = control_token(SpecialTokens.begin_system.value)
    self.END_SYSTEM = control_token(SpecialTokens.end_system.value)
    self.BEGIN_TOOL_CONTENT = control_token(SpecialTokens.begin_tool_content.value)

    # The transcribe control token is only resolved when audio is supported.
    self.TRANSCRIBE = control_token(SpecialTokens.transcribe.value) if audio_encoder is not None else None

encode_assistant_message(message, is_before_last_user_message, continue_message)

Encode an assistant message.

Parameters:

Name Type Description Default
message AssistantMessageType

The message to encode.

required
is_before_last_user_message bool

Not used.

required
continue_message bool

Whether to continue the message generation. Only use this if the assistant message is the last message.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(
    self, message: AssistantMessageType, is_before_last_user_message: bool, continue_message: bool
) -> List[int]:
    r"""Encode an assistant message.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.
        continue_message: Whether to continue the message generation.
            Only use this if the assistant message is the last message.

    Returns:
        The encoded tokens.
    """
    has_content = bool(message.content)
    has_tool_calls = bool(message.tool_calls)

    # A valid assistant message carries content, tool calls, or both.
    if not (has_content or has_tool_calls):
        raise TokenizerException(f"Invalid assistant message: {message}")
    if continue_message and message.prefix:
        raise InvalidAssistantMessageException(
            "`continue_message` is only supported for assistant messages that have `prefix=False`."
        )

    encoded: list = []
    if has_content:
        assert isinstance(message.content, str), f"Message content must be a string. Got {message.content}"
        encoded.extend(self._encode_normal_content_assistant_message(message))
    if has_tool_calls:
        encoded.extend(self._encode_tool_calls_in_assistant_message(message))

    # Only a finished message (neither a prefix nor a continuation) is closed with EOS.
    if not (message.prefix or continue_message):
        encoded.append(self.tokenizer.eos_id)

    return encoded

encode_system_message(message)

Encode a system message.

Parameters:

Name Type Description Default
message SystemMessage

The message to encode.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_system_message(self, message: SystemMessage) -> List[int]:
    r"""Encode a system message between the dedicated system special tokens.

    Args:
        message: The message to encode.

    Returns:
        The encoded tokens.
    """
    content_tokens = self.tokenizer.encode(message.content, bos=False, eos=False)
    return [self.BEGIN_SYSTEM, *content_tokens, self.END_SYSTEM]

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Note

Same as V3, but tools are not wrapped in a list and history is also tokenized.

Parameters:

Name Type Description Default
message ToolMessage

The message to encode.

required
is_before_last_user_message bool

Not used.

required

Returns:

Type Description
List[int]

The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Note:
        Same as [V3][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV3.encode_tool_message]
        but tools are not wrapped in a list and history is also tokenized.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.

    Returns:
        The encoded tokens.
    """
    assert message.tool_call_id is not None
    assert isinstance(message.content, str), "Message content must be normalized"

    encode = self.tokenizer.encode
    # Layout: [BEGIN_TOOL_RESULTS] <call id> [BEGIN_TOOL_CONTENT] <content> [END_TOOL_RESULTS]
    return [
        self.BEGIN_TOOL_RESULTS,
        *encode(message.tool_call_id, bos=False, eos=False),
        self.BEGIN_TOOL_CONTENT,
        *encode(message.content, bos=False, eos=False),
        self.END_TOOL_RESULTS,
    ]

encode_transcription(request)

Encodes an audio transcription request into a tokenized format.

This method processes a transcription request containing audio data, encodes the user message, and returns the tokenized output.

Parameters:

Name Type Description Default
request TranscriptionRequest

The transcription request object containing the audio data to be encoded.

required

Returns:

Type Description
Tokenized

The tokenized representation of the audio data, including processed audio and tokens

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_transcription(self, request: TranscriptionRequest) -> Tokenized:
    r"""
    Encodes an audio transcription request into a tokenized format.

    The request's audio is wrapped in a user message, encoded, optionally
    followed by a language tag, and terminated with the TRANSCRIBE token.

    Args:
        request: The transcription request object containing
            the audio data to be encoded.

    Returns:
        Tokenized: The tokenized representation of the audio data, including processed audio and tokens
    """
    assert self.TRANSCRIBE is not None, f"{self.__class__.__name__} needs to have a TRANSCRIBE token"

    audio_message = UserMessage(content=[AudioChunk(input_audio=request.audio)])
    message_tokens, _, audio = self.encode_user_message(
        audio_message,
        available_tools=[],
        is_last=True,
        is_first=True,
        system_prompt=None,
    )

    tokens = list(self.start())
    tokens.extend(message_tokens)

    if request.language is not None:
        # Language tag is encoded without a space after "lang:".
        tokens.extend(self.tokenizer.encode(f"lang:{request.language}", bos=False, eos=False))

    tokens.append(self.TRANSCRIBE)
    return Tokenized(tokens=tokens, text=self.tokenizer._to_string(tokens), audios=audio)

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False)

Encode a user message.

Parameters:

Name Type Description Default
message UserMessage

The message to encode.

required
available_tools Optional[List[Tool]]

The list of available tools if any.

required
is_last bool

Whether the message is the last one.

required
is_first bool

Whether the message is the first one.

required
system_prompt Optional[str]

Not used.

None
force_img_first bool

Whether to force the image to be first.

False

Returns:

Type Description
Tuple[List[int], List[ndarray], List[Audio]]

The encoded tokens and the lists of images and audios.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray], List[Audio]]:
    r"""Encode a user message.

    Args:
        message: The message to encode.
        available_tools: The list of available tools if any.
        is_last: Whether the message is the last one.
        is_first: Whether the message is the first one.
        system_prompt: Not used; must be None in V7.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and the lists of images and audios.
    """
    # V7 encodes system prompts as standalone SystemMessages, never inside user messages.
    assert system_prompt is None, "in Tokenizer V7 we don't encode system prompts in user messages"
    return super().encode_user_message(
        message,
        available_tools,
        is_last=is_last,
        is_first=is_first,
        system_prompt=None,
        force_img_first=force_img_first,
    )