mistral_common.tokens.tokenizers.sentencepiece

SentencePieceTokenizer(model_path, tokenizer_version=None)

Bases: Tokenizer

SentencePiece tokenizer.

Parameters:

    model_path (Union[str, Path], required)
        The path to the SentencePiece model.

    tokenizer_version (Optional[TokenizerVersion], default None)
        The version of the tokenizer. If not provided, it will be inferred from the model path.
Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def __init__(self, model_path: Union[str, Path], tokenizer_version: Optional[TokenizerVersion] = None) -> None:
    r"""Initialize the `SentencePieceTokenizer`.

    Args:
        model_path: The path to the `SentencePiece` model.
        tokenizer_version: The version of the tokenizer. If not provided, it will be inferred from the model path.
    """
    self._logger = logging.getLogger(self.__class__.__name__)
    # reload tokenizer
    assert os.path.isfile(model_path), model_path
    self._model = SentencePieceProcessor(
        model_file=model_path if isinstance(model_path, str) else model_path.as_posix()
    )

    assert self._model.vocab_size() == self._model.get_piece_size()
    self._vocab = [self._model.id_to_piece(i) for i in range(self.n_words)]

    self._version: TokenizerVersion = tokenizer_version or get_spm_version(model_path, raise_deprecated=False)

    self._file_path = Path(model_path)
    super().__init__()
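
A minimal usage sketch (the path below is a placeholder; point it at a real SentencePiece model file, where the trailing .v3 is only an example of a version-carrying filename):

    from mistral_common.tokens.tokenizers.sentencepiece import SentencePieceTokenizer

    # Placeholder path: the ".v3" suffix lets the tokenizer version be
    # inferred from the filename when tokenizer_version is not given.
    tokenizer = SentencePieceTokenizer("./tokenizer.model.v3")
    print(tokenizer.version)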

bos_id (cached property)

The beginning-of-sentence token id.

eos_id (cached property)

The end-of-sentence token id.

file_path (property)

The path to the tokenizer model.

n_words (property)

Vocabulary size of the tokenizer.

pad_id (property)

The padding token id.

unk_id (property)

The unknown token id.

version (property)

The version of the tokenizer.
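
Continuing the sketch above, the properties can be inspected directly (all ids are model-dependent):

    print(tokenizer.bos_id, tokenizer.eos_id)
    print(tokenizer.pad_id)     # may be -1 if the model defines no padding token
    print(tokenizer.n_words)    # vocabulary size
    print(tokenizer.file_path)  # path the model was loaded from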

decode(tokens, special_token_policy=None)

Decode the given list of token ids into a string.

Note

Using special_token_policy=SpecialTokenPolicy.KEEP will keep the special tokens and the normal tokens as SentencePiece pieces.

Parameters:

    tokens (List[int], required)
        The list of token ids.

    special_token_policy (Optional[SpecialTokenPolicy], default None)
        The policy to use for special tokens. If None, SpecialTokenPolicy.IGNORE is used; passing None is deprecated and will be removed in mistral_common 1.10.0, so pass a policy explicitly.

Returns:

    str
        The decoded string.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Decode the given list of token ids into a string.

    Note:
        Using `special_token_policy=SpecialTokenPolicy.KEEP` will keep the special tokens and the normal tokens as
        SentencePiece pieces.

    Args:
        tokens: The list of token ids.
        special_token_policy: The policy to use for special tokens. If `None`, the default policy
            is `SpecialTokenPolicy.IGNORE`.  Passing `None` is deprecated and will be changed
            to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

    Returns:
        The decoded string.
    """
    if special_token_policy is not None and not isinstance(special_token_policy, SpecialTokenPolicy):
        raise ValueError(
            f"Expected `special_token_policy` to be None or SpecialTokenPolicy, got {type(special_token_policy)}."
        )

    if special_token_policy is None:
        warnings.warn(
            (
                "Using the tokenizer's special token policy `None` is deprecated. "
                "It will be removed in 1.10.0. "
                "Please pass a special token policy explicitly. "
                "Future default will be SpecialTokenPolicy.IGNORE."
            ),
            FutureWarning,
        )
        special_token_policy = SpecialTokenPolicy.IGNORE

    if special_token_policy in [SpecialTokenPolicy.KEEP, SpecialTokenPolicy.RAISE]:
        return self._decode_with_special_tokens(tokens, special_token_policy)

    return self._model.decode(tokens)  # type: ignore
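
A short sketch of the policy's effect, assuming SpecialTokenPolicy is importable from mistral_common.tokens.tokenizers.base and continuing the tokenizer instance from above:

    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

    ids = tokenizer.encode("Hello world", bos=True, eos=True)

    # IGNORE drops special tokens from the decoded text.
    print(tokenizer.decode(ids, special_token_policy=SpecialTokenPolicy.IGNORE))

    # KEEP returns SentencePiece pieces, special tokens included.
    print(tokenizer.decode(ids, special_token_policy=SpecialTokenPolicy.KEEP))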

encode(s, bos, eos)

Encode the given string into a list of token ids.

Parameters:

    s (str, required)
        The string to encode.

    bos (bool, required)
        Whether to add the beginning-of-sentence token.

    eos (bool, required)
        Whether to add the end-of-sentence token.

Returns:

    List[int]
        The list of token ids.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
    r"""Encode the given string into a list of token ids.

    Args:
        s: The string to encode.
        bos: Whether to add the beginning of sentence token.
        eos: Whether to add the end of sentence token.

    Returns:
        The list of token ids.
    """
    assert isinstance(s, str)
    t: List[int] = self._model.encode(s)
    if bos:
        t = [self.bos_id, *t]
    if eos:
        t = [*t, self.eos_id]
    return t
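
As the source shows, bos and eos simply prepend and append the corresponding token ids; for example, continuing the instance from above:

    ids = tokenizer.encode("Hello world", bos=True, eos=False)
    assert ids[0] == tokenizer.bos_id

    ids = tokenizer.encode("Hello world", bos=False, eos=True)
    assert ids[-1] == tokenizer.eos_id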

get_control_token(s)

Get the control token for the given string.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def get_control_token(self, s: str) -> int:
    r"""Get the control token for the given string."""
    return self._model.piece_to_id(s)  # type: ignore

id_to_piece(token_id)

Convert the given token id to a token piece.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def id_to_piece(self, token_id: int) -> str:
    r"""Convert the given token id to a token piece."""
    return self._model.id_to_piece(token_id)  # type: ignore
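
id_to_piece is the inverse of get_control_token at the piece level; a sketch (the "[INST]" string is hypothetical and must exist in the loaded model's vocabulary, otherwise SentencePiece maps it to the unknown id):

    token_id = tokenizer.get_control_token("[INST]")  # hypothetical control token
    assert tokenizer.id_to_piece(token_id) == "[INST]"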

to_string(tokens)

[DEPRECATED] Convert a list of token ids into a string, keeping special tokens.

Use decode with special_token_policy=SpecialTokenPolicy.KEEP instead.

This is a convenience method for debugging.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def to_string(self, tokens: List[int]) -> str:
    r"""[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

    Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

    This is a convenient method for debugging.
    """
    warnings.warn(
        (
            "`to_string` is deprecated and will be removed in 1.10.0. "
            "Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead."
        ),
        FutureWarning,
    )
    return self._to_string(tokens)
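
The recommended replacement, sketched with the SpecialTokenPolicy import path assumed above:

    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

    # Instead of the deprecated tokenizer.to_string(ids):
    text = tokenizer.decode(ids, special_token_policy=SpecialTokenPolicy.KEEP)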

vocab()

All tokens in the vocabulary as strings.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def vocab(self) -> List[str]:
    r"""All tokens in the vocabulary as strings."""
    return self._vocab

get_image_config(tokenizer_filename)

Get the image config from the tokenizer filename.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def get_image_config(tokenizer_filename: Union[str, Path]) -> Optional[ImageConfig]:
    r"""Get the image config from the tokenizer filename."""
    tokenizer_filename = str(tokenizer_filename)

    _version_str = tokenizer_filename.split(".")[-1]
    if _version_str == "model" or "m" not in _version_str:
        return None

    _mm_version_str = "m" + _version_str.split("m")[-1]

    if _mm_version_str not in MultiModalVersion.__members__:
        raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

    return MultiModalVersion(_mm_version_str).config
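
The result follows the filename suffix, as the source shows; for example (filenames are illustrative, and m1 is assumed to be a defined MultiModalVersion member):

    from mistral_common.tokens.tokenizers.sentencepiece import get_image_config

    print(get_image_config("tokenizer.model"))       # None: bare .model suffix
    print(get_image_config("tokenizer.model.v3"))    # None: no multimodal part
    print(get_image_config("tokenizer.model.v3m1"))  # ImageConfig for MultiModalVersion.m1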

get_spm_version(tokenizer_filename, raise_deprecated=False)

Get the version of the tokenizer from the filename.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def get_spm_version(tokenizer_filename: Union[str, Path], raise_deprecated: bool = False) -> TokenizerVersion:
    r"""Get the version of the tokenizer from the filename."""
    tokenizer_filename = str(tokenizer_filename)

    _version_str = tokenizer_filename.split(".")[-1]
    if _version_str != "model":  # filter tokenizer_filename == "/path/to/tokenizer.model" case
        _version_str = _version_str.split("m")[0]

    if _version_str == "model":
        if raise_deprecated:
            raise TokenizerException(f"Make sure to rename your tokenizer file to end with {tokenizer_filename}.v1.")

        # tokenizer.model => tokenizer.model.v1
        return TokenizerVersion("v1")

    if _version_str not in TokenizerVersion.__members__:
        raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

    return TokenizerVersion(_version_str)
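
For example (filenames are illustrative):

    from mistral_common.tokens.tokenizers.sentencepiece import get_spm_version

    print(get_spm_version("tokenizer.model.v3"))    # TokenizerVersion.v3
    print(get_spm_version("tokenizer.model.v3m1"))  # TokenizerVersion.v3 (multimodal part stripped)
    print(get_spm_version("tokenizer.model"))       # TokenizerVersion.v1 (legacy fallback)

    # With raise_deprecated=True, the bare "tokenizer.model" case raises
    # a TokenizerException instead of falling back to v1.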

is_sentencepiece(path)

Check if the given path is a SentencePiece model.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def is_sentencepiece(path: Union[str, Path]) -> bool:
    r"""Check if the given path is a SentencePiece model."""
    if isinstance(path, str):
        path = Path(path)

    instruct_versions = list(TokenizerVersion.__members__)
    mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no mm version
    suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]

    return path.is_file() and any(path.name.endswith(suffix) for suffix in suffixes)
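
A quick sketch (the path is a placeholder; the check requires an existing file with a recognized suffix):

    from pathlib import Path
    from mistral_common.tokens.tokenizers.sentencepiece import is_sentencepiece

    # True only if the file exists and its name ends with .model or a
    # versioned suffix such as .model.v3 or .model.v3m1.
    print(is_sentencepiece(Path("./tokenizer.model.v3")))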