
mistral_common.tokens.tokenizers.instruct

InstructTokenizerBase(tokenizer, mm_encoder=None)

Bases: InstructTokenizer, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Base instruct tokenizer.

Parameters:

- tokenizer (Tokenizer): The tokenizer to use. Required.
- mm_encoder (Optional[MultiModalEncoder]): The multi-modal encoder to use, if any. Default: None.
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder] = None):
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        mm_encoder: The multi-modal encoder to use if any.
    """
    self.tokenizer = tokenizer
    self.mm_encoder = mm_encoder
    super().__init__(tokenizer, mm_encoder)

decode(tokens)

Decode tokens to a string.

Parameters:

- tokens (List[int]): The tokens to decode. Required.

Returns:

- str: The decoded string.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def decode(self, tokens: List[int]) -> str:
    r"""Decode tokens to a string.

    Args:
        tokens: The tokens to decode.

    Returns:
        The decoded string.
    """
    return self.tokenizer.decode(tokens)

encode_assistant_message(message, is_before_last_user_message) abstractmethod

Encode an assistant message.

Raises:

- NotImplementedError: The assistant message is not implemented for the base tokenizer.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
@abstractmethod
def encode_assistant_message(self, message: AssistantMessageType, is_before_last_user_message: bool) -> List[int]:
    r"""Encode an assistant message.

    Raises:
        NotImplementedError: The assistant message is not implemented for the base tokenizer.
    """
    raise NotImplementedError("Assistant message not implemented")

encode_instruct(request)

Encode an instruct request.

Parameters:

- request (InstructRequest[AssistantMessageType, Tool]): The request to encode. Required.

Returns:

- Tokenized: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_instruct(
    self,
    request: InstructRequest[AssistantMessageType, Tool],
) -> Tokenized:
    r"""Encode an instruct request.

    Args:
        request: The request to encode.

    Returns:
        The encoded tokens.
    """
    # init at bos
    images: List[np.ndarray] = []
    prefix_ids: Optional[List[int]] = None
    tokens_list: List[Optional[List[int]]] = []

    # find last user message
    first_user_idx, last_user_idx = self.find_first_last_user(request)
    for msg_idx, msg in enumerate(request.messages):
        if isinstance(msg, UserMessage):
            new_tokens, new_images = self.encode_user_message(
                msg,
                request.available_tools,
                msg_idx == last_user_idx,
                msg_idx == first_user_idx,
                system_prompt=request.system_prompt,
                force_img_first=True,  # img is always first when providing text/img chunk pair
            )
            images.extend(new_images)
        elif isinstance(msg, ToolMessage):
            new_tokens = self.encode_tool_message(msg, msg_idx < last_user_idx)
        elif isinstance(msg, AssistantMessage):
            new_tokens = self.encode_assistant_message(msg, msg_idx < last_user_idx)
            if msg_idx == len(request.messages) - 1:
                prefix_ids = new_tokens
        elif isinstance(msg, SystemMessage):
            new_tokens = self.encode_system_message(msg)

        tokens_list.append(new_tokens)

    if request.truncate_at_max_tokens is not None:
        self._truncate_for_max_tokens(
            tokens_list,
            request.messages,
            request.truncate_at_max_tokens,
            last_user_idx,
        )
    tokens = self.start()

    for tok in tokens_list:
        if tok is not None:
            tokens.extend(tok)

    return Tokenized(
        tokens=tokens,
        text=self.tokenizer.to_string(tokens),
        prefix_ids=prefix_ids,
        images=images,
    )

encode_tool_message(message, is_before_last_user_message) abstractmethod

Encode a tool message.

Raises:

- NotImplementedError: The tool message is not implemented for the base tokenizer.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
@abstractmethod
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Raises:
        NotImplementedError: The tool message is not implemented for the base tokenizer.
    """
    raise NotImplementedError("Tool message not implemented")

find_first_last_user(request) staticmethod

Find the first and last user message in the request.

Parameters:

- request (InstructRequest): The request to search for user messages. Required.

Returns:

- Tuple[int, int]: The indices of the first and last user messages.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
@staticmethod
def find_first_last_user(request: InstructRequest) -> Tuple[int, int]:
    r"""Find the first and last user message in the request.

    Args:
        request: The request to search for user messages.

    Returns:
        The index of the first and last user message.
    """
    last_user_idx = -1
    first_user_idx = -1
    for i, msg in list(enumerate(request.messages)):
        if isinstance(msg, UserMessage):
            if first_user_idx == -1:
                first_user_idx = i
            last_user_idx = i
    return first_user_idx, last_user_idx
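For instance, the returned values are 0-based indices into request.messages, and a leading system message is skipped. A small sketch, reusing the assumed import paths from the earlier example:

# Toy illustration of find_first_last_user (import paths assumed as in the earlier sketch).
from mistral_common.protocol.instruct.messages import AssistantMessage, SystemMessage, UserMessage
from mistral_common.tokens.instruct.request import InstructRequest
from mistral_common.tokens.tokenizers.instruct import InstructTokenizerBase

request = InstructRequest(
    messages=[
        SystemMessage(content="Be brief."),   # index 0
        UserMessage(content="Hi"),            # index 1 -> first user message
        AssistantMessage(content="Hello!"),   # index 2
        UserMessage(content="Bye"),           # index 3 -> last user message
    ]
)
first_user_idx, last_user_idx = InstructTokenizerBase.find_first_last_user(request)
print(first_user_idx, last_user_idx)  # 1 3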

function_call_prefix(tool_choice)

Return the function call prefix tokens.

Raises:

- NotImplementedError: The function call prefix is not implemented for the base tokenizer.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def function_call_prefix(self, tool_choice: ToolChoice) -> List[int]:
    r"""Return the function call prefix tokens.

    Raises:
        NotImplementedError: The function call prefix is not implemented for the base tokenizer.
    """
    raise NotImplementedError("Tool prefix not implemented")

start()

Return the start tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def start(self) -> List[int]:
    r"""Return the start tokens."""
    return [self.tokenizer.bos_id]

InstructTokenizerV1(tokenizer, mm_encoder=None)

Bases: InstructTokenizerBase, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Instruct tokenizer V1.

This tokenizer has basic support for messages. It does not support tools or multi-modal inputs.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder] = None):
    r"""Initialize the instruct tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        mm_encoder: The multi-modal encoder to use if any.
    """
    self.tokenizer = tokenizer
    self.mm_encoder = mm_encoder
    super().__init__(tokenizer, mm_encoder)

encode_assistant_message(message, is_before_last_user_message)

Encode an assistant message.

Parameters:

- message (AssistantMessageType): The message to encode. Required.
- is_before_last_user_message (bool): Not used. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(self, message: AssistantMessageType, is_before_last_user_message: bool) -> List[int]:
    r"""Encode an assistant message.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.

    Returns:
        The encoded tokens.
    """
    assert isinstance(message, AssistantMessage), message
    if message.tool_calls is not None and len(message.tool_calls) > 0:
        raise TokenizerException("Tools not implemented for tokenizer V1")
    elif message.content:
        curr_tokens = self.tokenizer.encode(message.content, bos=False, eos=False)
    else:
        raise TokenizerException(f"{message.content} // {message.tool_calls}")
    if not message.prefix:
        curr_tokens.append(self.tokenizer.eos_id)
    return curr_tokens

encode_fim(request)

Encode a FIM request.

Raises:

- TokenizerException: The FIM request is not implemented for this version.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_fim(self, request: FIMRequest) -> Tokenized:
    r"""Encode a FIM request.

    Raises:
       TokenizerException: The FIM request is not implemented for this version.
    """
    raise TokenizerException("FIM not available for tokenizer V1")

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Raises:

- TokenizerException: The tool message is not implemented for this version.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Raises:
        TokenizerException: The tool message is not implemented for this version.
    """
    raise TokenizerException("Tools not implemented for tokenizer V1")

encode_user_content(content, is_last, system_prompt=None, force_img_first=False)

Encode user content.

Parameters:

- content (Union[str, List[ContentChunk]]): The content to encode. Required.
- is_last (bool): Whether the message is the last one. Required.
- system_prompt (Optional[str]): The system prompt. Default: None.
- force_img_first (bool): Not used. Default: False.

Returns:

- Tuple[List[int], List[ndarray]]: The encoded tokens and an empty list.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_content(
    self,
    content: Union[str, List[ContentChunk]],
    is_last: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray]]:
    r"""Encode a user content.

    Args:
        content: The content to encode.
        is_last: Whether the message is the last one.
        system_prompt: The system prompt.
        force_img_first: Not used.

    Returns:
        The encoded tokens and empty list.
    """
    assert isinstance(content, str)

    if is_last and system_prompt:
        content = system_prompt + "\n\n" + content

    tokens = self.tokenizer.encode(content, bos=False, eos=False)
    return tokens, []

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False)

Encode a user message.

Parameters:

- message (UserMessage): The message to encode. Required.
- available_tools (Optional[List[Tool]]): Not used. Required.
- is_last (bool): Not used. Required.
- is_first (bool): Whether the message is the first one. Required.
- system_prompt (Optional[str]): The system prompt. Default: None.
- force_img_first (bool): Not used. Default: False.

Returns:

- Tuple[List[int], List[ndarray]]: The encoded tokens and an empty list.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray]]:
    r"""Encode a user message.

    Args:
        message: The message to encode.
        available_tools: Not used.
        is_last: Not used.
        is_first: Whether the message is the first one.
        system_prompt: The system prompt.
        force_img_first: Not used.

    Returns:
        The encoded tokens and empty list.
    """
    assert message.content is not None
    assert isinstance(message.content, str), "Message content must be normalized"
    assert self.mm_encoder is None, "InstructTokenizerV1 cannot encode images"

    content = ""
    if is_first and system_prompt:
        content = system_prompt + "\n\n" + message.content
    else:
        content = message.content

    message_txt = f"[INST] {content} [/INST]"
    curr_tokens, image_tokens = self.encode_user_content(content=message_txt, is_last=False, system_prompt=None)
    return curr_tokens, image_tokens
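At the string level, the template built by the method above looks as follows (the real method returns token ids, not text):

# String-level sketch of the V1 user-message template; whitespace mirrors the source above.
system_prompt = "You are a helpful assistant."
first_user_content = "What is 2 + 2?"
later_user_content = "And 3 + 3?"

# First user message: the system prompt is prepended, separated by a blank line.
print(f"[INST] {system_prompt}\n\n{first_user_content} [/INST]")
# Subsequent user messages: content only.
print(f"[INST] {later_user_content} [/INST]")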

InstructTokenizerV2(tokenizer, mm_encoder=None)

Bases: InstructTokenizerV1, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Instruct tokenizer V2.

This tokenizer adds support for images, tools, and FIM requests.

Parameters:

- tokenizer (Tokenizer): The tokenizer to use. Required.
- mm_encoder (Optional[MultiModalEncoder]): The multi-modal encoder to use. Default: None.
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder] = None):
    r"""Initialize the tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        mm_encoder: The multi-modal encoder to use.
    """
    super().__init__(tokenizer, mm_encoder)
    self.BEGIN_INST = self.tokenizer.get_control_token(SpecialTokens.begin_inst.value)
    self.END_INST = self.tokenizer.get_control_token(SpecialTokens.end_inst.value)
    self.BEGIN_AVAILABLE_TOOLS = self.tokenizer.get_control_token(SpecialTokens.begin_tools.value)
    self.END_AVAILABLE_TOOLS = self.tokenizer.get_control_token(SpecialTokens.end_tools.value)
    self.BEGIN_TOOL_RESULTS = self.tokenizer.get_control_token(SpecialTokens.begin_tool_results.value)
    self.END_TOOL_RESULTS = self.tokenizer.get_control_token(SpecialTokens.end_tool_results.value)
    self.TOOL_CALLS = self.tokenizer.get_control_token(SpecialTokens.tool_calls.value)
    self.BOS = self.tokenizer.get_control_token(SpecialTokens.bos.value)
    self.PREFIX = self.tokenizer.get_control_token(SpecialTokens.prefix.value)
    self.SUFFIX = self.tokenizer.get_control_token(SpecialTokens.suffix.value)

encode_assistant_message(message, is_before_last_user_message)

Encode an assistant message.

Parameters:

- message (AssistantMessageType): The message to encode. Required.
- is_before_last_user_message (bool): Whether the message comes before the last user message. If it does and the message contains tool calls, the message is not encoded. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(self, message: AssistantMessageType, is_before_last_user_message: bool) -> List[int]:
    r"""Encode an assistant message.

    Args:
        message: The message to encode.
        is_before_last_user_message: Whether the message is before the last user message. If has tools and true, the
            message is not encoded.

    Returns:
        The encoded tokens.
    """
    if message.tool_calls and message.content:
        raise ValueError(f"Cannot have tool calls and content defined in the same assistant message {message}")

    if message.tool_calls:
        if is_before_last_user_message:
            # don't tokenize tool call before last user message
            return []
        curr_tokens = self._encode_tool_calls_in_assistant_message(message)
    elif message.content:
        curr_tokens = self._encode_normal_content_assistant_message(message)
    else:
        raise TokenizerException(f"Invalid assistant message: {message.content}")
    if not message.prefix:
        curr_tokens.append(self.tokenizer.eos_id)
    return curr_tokens
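The prefix flag is what enables assistant prefix continuation: when the last message is an assistant message with prefix=True, no EOS is appended and encode_instruct records its tokens as prefix_ids. A hedged sketch, with the same assumed import paths as the earlier example:

# Hedged sketch of assistant prefix continuation (import paths assumed as before).
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import AssistantMessage, UserMessage
from mistral_common.tokens.instruct.request import InstructRequest

instruct_tokenizer = MistralTokenizer.v3().instruct_tokenizer

request = InstructRequest(
    messages=[
        UserMessage(content="Write a haiku about the sea."),
        # prefix=True: the model is meant to continue this partial answer,
        # so no EOS is appended after its tokens.
        AssistantMessage(content="Salt wind over waves,", prefix=True),
    ]
)
tokenized = instruct_tokenizer.encode_instruct(request)
print(tokenized.prefix_ids)  # token ids of the trailing assistant prefix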

encode_fim(request)

Encode a FIM request.

Parameters:

- request (FIMRequest): The request to encode. Required.

Returns:

- Tokenized: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_fim(self, request: FIMRequest) -> Tokenized:
    r"""Encode a FIM request.

    Args:
        request: The request to encode.

    Returns:
        The encoded tokens.
    """
    prefix_tokens = self.tokenizer.encode(request.prompt, bos=False, eos=False)
    suffix_tokens = self._encode_infilling(request.suffix) if request.suffix else []
    tokens = [
        self.BOS,
        self.SUFFIX,
        *suffix_tokens,
        self.PREFIX,
        *prefix_tokens,
    ]
    return Tokenized(tokens=tokens, text=self.tokenizer.to_string(tokens))
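A hedged FIM usage sketch. The FIMRequest import path is an assumption (adjust it to wherever FIMRequest lives in your version); the bundled V3 tokenizer, which inherits this method, stands in for a concrete tokenizer.

# Hedged FIM sketch: the FIMRequest import path is an assumption.
from mistral_common.protocol.fim.request import FIMRequest
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

instruct_tokenizer = MistralTokenizer.v3().instruct_tokenizer  # inherits encode_fim from V2

fim_request = FIMRequest(prompt="def add(a, b):\n    return ", suffix="\n\nprint(add(1, 2))")
tokenized = instruct_tokenizer.encode_fim(fim_request)

# Token layout: BOS, SUFFIX, <suffix tokens>, PREFIX, <prompt tokens>
print(tokenized.text)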

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Parameters:

- message (ToolMessage): The message to encode. Required.
- is_before_last_user_message (bool): Whether the message is before the last user message. If true, the message is not encoded. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Args:
        message: The message to encode.
        is_before_last_user_message: Whether the message is before the last user message. If true, the message is
            not encoded.

    Returns:
        The encoded tokens.
    """
    if is_before_last_user_message:
        # don't tokenize last tool response before last user msg
        return []

    # Currently only supports single tool results
    tool_result_str = json.dumps([self._prepare_tool_result(message)], ensure_ascii=False)
    curr_tokens = [
        self.BEGIN_TOOL_RESULTS,
        *self.tokenizer.encode(tool_result_str, bos=False, eos=False),
        self.END_TOOL_RESULTS,
    ]
    return curr_tokens

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False)

Encode a user message.

Parameters:

- message (UserMessage): The message to encode. Required.
- available_tools (Optional[List[Tool]]): The list of available tools, if any. Required.
- is_last (bool): Whether the message is the last one. Required.
- is_first (bool): Not used. Required.
- system_prompt (Optional[str]): The system prompt. Default: None.
- force_img_first (bool): Whether to force the image to be first. Default: False.

Returns:

- Tuple[List[int], List[ndarray]]: The encoded tokens and the list of images.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray]]:
    r"""Encode a user message.

    Args:
        message: The message to encode.
        available_tools: The list of available tools if any.
        is_last: Whether the message is the last one.
        is_first: Not used.
        system_prompt: The system prompt.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and the list of images.
    """
    assert message.content is not None
    tools_tokens: List[int] = []
    if is_last and available_tools:
        tools = [tool.model_dump() for tool in available_tools]
        tools_json_tokens = self.tokenizer.encode(json.dumps(tools, ensure_ascii=False), bos=False, eos=False)
        tools_tokens = [
            self.BEGIN_AVAILABLE_TOOLS,
            *tools_json_tokens,
            self.END_AVAILABLE_TOOLS,
        ]

    tokens, image_tokens = self.encode_user_content(
        content=message.content,
        is_last=is_last,
        system_prompt=system_prompt,
        force_img_first=force_img_first,
    )

    prefix_tokens = [*tools_tokens, self.BEGIN_INST]
    suffix_tokens = [self.END_INST]

    curr_tokens = prefix_tokens + tokens + suffix_tokens

    return curr_tokens, image_tokens
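At the string level, the last user message with available tools is laid out roughly as follows. The tool schema below is hypothetical, the bracketed markers stand in for the BEGIN_AVAILABLE_TOOLS, END_AVAILABLE_TOOLS, BEGIN_INST, and END_INST control tokens, and spacing is illustrative only; the real method works on token ids.

import json

# String-level sketch of the V2 layout for the last user message when tools are available.
# Hypothetical tool schema; bracketed markers stand in for control tokens.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
        },
    }
]
content = "What's the weather in Paris?"

rendered = (
    "[AVAILABLE_TOOLS]" + json.dumps(tools, ensure_ascii=False) + "[/AVAILABLE_TOOLS]"
    + "[INST] " + content + " [/INST]"
)
print(rendered)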

InstructTokenizerV3(tokenizer, mm_encoder=None)

Bases: InstructTokenizerV2, Generic[InstructRequestType, FIMRequestType, TokenizedType, AssistantMessageType]

Instruct tokenizer V3.

The only difference from the V2 tokenizer is that tool messages are encoded differently.

Parameters:

- tokenizer (Tokenizer): The tokenizer to use. Required.
- mm_encoder (Optional[MultiModalEncoder]): The multi-modal encoder to use. Default: None.
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder] = None) -> None:
    r"""Initialize the tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        mm_encoder: The multi-modal encoder to use.
    """
    super().__init__(tokenizer, mm_encoder=mm_encoder)

encode_assistant_message(message, is_before_last_user_message)

Encode an assistant message.

Note

Same as V2 but the tool history is always encoded.

Parameters:

- message (AssistantMessageType): The message to encode. Required.
- is_before_last_user_message (bool): Not used. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(self, message: AssistantMessageType, is_before_last_user_message: bool) -> List[int]:
    r"""Encode an assistant message.

    Note:
        Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_assistant_message] but
        always encode the tool history.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.

    Returns:
        The encoded tokens.
    """
    return super().encode_assistant_message(message, False)

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Note

Same as V2 but tools are not wrapped in a list and the history is also tokenized.

Parameters:

- message (ToolMessage): The message to encode. Required.
- is_before_last_user_message (bool): Not used; the tool message is always encoded (see the note above). Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Note:
        Same as [V2][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV2.encode_tool_message] but tools
        are not wrapped in a list and the history is also tokenized.

    Args:
        message: The message to encode.
        is_before_last_user_message: Whether the message is before the last user message. If true, the message is
            not encoded.

    Returns:
        The encoded tokens.
    """
    tool_result_str = json.dumps(self._prepare_tool_result(message), ensure_ascii=False)
    curr_tokens = [
        self.BEGIN_TOOL_RESULTS,
        *self.tokenizer.encode(tool_result_str, bos=False, eos=False),
        self.END_TOOL_RESULTS,
    ]
    return curr_tokens
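The practical difference from V2 is the shape of the JSON payload: V2 wraps the single prepared result in a list, while V3 emits it bare (and also emits results that precede the last user message). A sketch with a hypothetical result dict; the real dict is built by _prepare_tool_result.

import json

# Sketch of the tool-result payload difference between V2 and V3.
# The dict below is hypothetical; the real one comes from `_prepare_tool_result`.
result = {"call_id": "abc123456", "content": '{"temperature": 21}'}

v2_payload = json.dumps([result], ensure_ascii=False)  # V2: single result wrapped in a list
v3_payload = json.dumps(result, ensure_ascii=False)    # V3: bare result object

# Both payloads are placed between the BEGIN_TOOL_RESULTS and END_TOOL_RESULTS control tokens.
print(v2_payload)
print(v3_payload)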

encode_user_content(content, is_last, system_prompt=None, force_img_first=False)

Encode user content.

Parameters:

- content (Union[str, List[ContentChunk]]): The content to encode. Required.
- is_last (bool): Whether the message is the last one. Required.
- system_prompt (Optional[str]): The system prompt. Default: None.
- force_img_first (bool): Whether to force the image to be first. Default: False.

Returns:

- Tuple[List[int], List[ndarray]]: The encoded tokens and the images.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_content(
    self,
    content: Union[str, List[ContentChunk]],
    is_last: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray]]:
    r"""Encode a user content.

    Args:
        content: The content to encode.
        is_last: Whether the message is the last one.
        system_prompt: The system prompt.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and the images.
    """
    if isinstance(content, str):
        return super().encode_user_content(content, is_last, system_prompt)

    tokens: List[int] = []
    images: List[np.ndarray] = []

    has_one_img_one_text_first = (
        len(content) == 2 and isinstance(content[0], TextChunk) and not isinstance(content[1], TextChunk)
    )
    if force_img_first and has_one_img_one_text_first:
        # make sure that if exactly one image and text chunk are passed we force the image chunk to be first
        content = [content[1], content[0]]

    first_chunk = True
    for chunk in content:
        content = ""
        if first_chunk and is_last and system_prompt:
            first_chunk = False
            content = system_prompt + "\n\n"
        if isinstance(chunk, TextChunk):
            content += chunk.text
            tokens.extend(self.tokenizer.encode(content, bos=False, eos=False))
        else:
            assert self.mm_encoder is not None, "Make sure to define a multi-modal encoder at init"
            if content:
                tokens.extend(self.tokenizer.encode(content, bos=False, eos=False))

            img_encoding = self.mm_encoder(chunk)

            tokens.extend(img_encoding.tokens)
            images.append(img_encoding.image)

    return tokens, images
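The reordering guarantee for a single text/image pair can be illustrated with stand-in values; real calls use TextChunk/ImageChunk objects and require an mm_encoder.

# Stand-in illustration of the force_img_first swap for a [text, image] chunk pair.
content = [("text", "Describe this image."), ("image", "<raw image bytes>")]

force_img_first = True
has_one_img_one_text_first = len(content) == 2 and content[0][0] == "text"
if force_img_first and has_one_img_one_text_first:
    content = [content[1], content[0]]

print([kind for kind, _ in content])  # ['image', 'text']: image tokens are emitted first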

InstructTokenizerV7(tokenizer, mm_encoder=None)

Bases: InstructTokenizerV3

Instruct tokenizer V7.

The difference from the V3 tokenizer is that system prompts are encoded differently:

- in V7, system prompts are treated as separate SystemMessages;
- they are no longer prepended to the last user message;
- they are printed between special tokens.

Tool call results are encoded as:

- [begin tool call] call_id_tokens [tool_content] content tokens [end tool call].

Parameters:

- tokenizer (Tokenizer): The tokenizer to use. Required.
- mm_encoder (Optional[MultiModalEncoder]): The multi-modal encoder to use. Default: None.
Source code in src/mistral_common/tokens/tokenizers/instruct.py
def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder] = None) -> None:
    r"""Initialize the tokenizer.

    Args:
        tokenizer: The tokenizer to use.
        mm_encoder: The multi-modal encoder to use.
    """

    super().__init__(tokenizer, mm_encoder)
    self.BEGIN_SYSTEM = self.tokenizer.get_control_token(SpecialTokens.begin_system.value)
    self.END_SYSTEM = self.tokenizer.get_control_token(SpecialTokens.end_system.value)
    self.BEGIN_TOOL_CONTENT = self.tokenizer.get_control_token(SpecialTokens.begin_tool_content.value)
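Putting the control tokens above together, here is a string-level sketch of how a V7 system message and tool result are laid out, as implemented by encode_system_message and encode_tool_message below. The bracketed markers stand in for the control tokens fetched in the constructor, and the call id and content are hypothetical; the real methods work on token ids.

# String-level sketch of the V7 layout. Bracketed markers stand in for control tokens.
system_content = "You are a helpful assistant."
call_id = "abc123456"
tool_content = '{"temperature": 21}'

# System messages are wrapped between the BEGIN_SYSTEM / END_SYSTEM control tokens.
print(f"[BEGIN_SYSTEM]{system_content}[END_SYSTEM]")

# Tool results carry the call id, then the content after BEGIN_TOOL_CONTENT.
print(f"[BEGIN_TOOL_RESULTS]{call_id}[BEGIN_TOOL_CONTENT]{tool_content}[END_TOOL_RESULTS]")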

encode_assistant_message(message, is_before_last_user_message)

Encode an assistant message.

Parameters:

- message (AssistantMessageType): The message to encode. Required.
- is_before_last_user_message (bool): Not used. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_assistant_message(self, message: AssistantMessageType, is_before_last_user_message: bool) -> List[int]:
    r"""Encode an assistant message.

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.

    Returns:
        The encoded tokens.
    """
    if not message.content and not message.tool_calls:
        raise TokenizerException(f"Invalid assistant message: {message}")
    curr_tokens: list = []
    if message.content:
        if isinstance(message.content, str):
            curr_tokens += self._encode_normal_content_assistant_message(message)
        elif isinstance(message.content, list):
            curr_tokens += self.encode_content_chunks(
                message.content, is_last=False, system_prompt=None, force_img_first=True
            ).tokens
    if message.tool_calls:
        curr_tokens += self._encode_tool_calls_in_assistant_message(message)
    if not message.prefix:
        curr_tokens.append(self.tokenizer.eos_id)

    return curr_tokens

encode_system_message(message)

Encode a system message.

Parameters:

- message (SystemMessage): The message to encode. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_system_message(self, message: SystemMessage) -> List[int]:
    r"""Encode a system message.

    Args:
        message: The message to encode.

    Returns:
        The encoded tokens.
    """
    assert message.content is not None
    assert isinstance(message.content, str), "Message content must be normalized"
    tokens = [
        self.BEGIN_SYSTEM,
        *self.tokenizer.encode(message.content, bos=False, eos=False),
        self.END_SYSTEM,
    ]
    return tokens

encode_tool_message(message, is_before_last_user_message)

Encode a tool message.

Note

Same as V3 but tools are not wrapped in a list and the history is also tokenized.

Parameters:

- message (ToolMessage): The message to encode. Required.
- is_before_last_user_message (bool): Not used. Required.

Returns:

- List[int]: The encoded tokens.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_tool_message(self, message: ToolMessage, is_before_last_user_message: bool) -> List[int]:
    r"""Encode a tool message.

    Note:
        Same as [V3][mistral_common.tokens.tokenizers.instruct.InstructTokenizerV3.encode_tool_message]
        but tools are not wrapped in a list and history is also tokenized

    Args:
        message: The message to encode.
        is_before_last_user_message: Not used.

    Returns:
        The encoded tokens.
    """
    assert message.tool_call_id is not None
    tool_call_id_tokens = self.tokenizer.encode(message.tool_call_id, bos=False, eos=False)
    tokens = self.tokenizer.encode(message.content, bos=False, eos=False)

    prefix_tokens = [
        self.BEGIN_TOOL_RESULTS,
        *tool_call_id_tokens,
        self.BEGIN_TOOL_CONTENT,
    ]
    curr_tokens = [
        *prefix_tokens,
        *tokens,
        self.END_TOOL_RESULTS,
    ]
    return curr_tokens

encode_user_message(message, available_tools, is_last, is_first, system_prompt=None, force_img_first=False)

Encode a user message.

Parameters:

- message (UserMessage): The message to encode. Required.
- available_tools (Optional[List[Tool]]): The list of available tools, if any. Required.
- is_last (bool): Whether the message is the last one. Required.
- is_first (bool): Whether the message is the first one. Required.
- system_prompt (Optional[str]): Not used. Default: None.
- force_img_first (bool): Whether to force the image to be first. Default: False.

Returns:

- Tuple[List[int], List[ndarray]]: The encoded tokens and the list of images.

Source code in src/mistral_common/tokens/tokenizers/instruct.py
def encode_user_message(
    self,
    message: UserMessage,
    available_tools: Optional[List[Tool]],
    is_last: bool,
    is_first: bool,
    system_prompt: Optional[str] = None,
    force_img_first: bool = False,
) -> Tuple[List[int], List[np.ndarray]]:
    r"""Encode a user message.

    Args:
        message: The message to encode.
        available_tools: The list of available tools if any.
        is_last: Whether the message is the last one.
        is_first: Whether the message is the first one.
        system_prompt: Not used.
        force_img_first: Whether to force the image to be first.

    Returns:
        The encoded tokens and the list of images.
    """
    assert system_prompt is None, "in Tokenizer V7 we don't encode system prompts in user messages"
    return super().encode_user_message(
        message,
        available_tools,
        is_last=is_last,
        is_first=is_first,
        system_prompt=None,
        force_img_first=force_img_first,
    )