mistral_common.tokens.tokenizers.sentencepiece

SentencePieceTokenizer(model_path, tokenizer_version=None)

Bases: Tokenizer

SentencePiece tokenizer.

Parameters:

    model_path (Union[str, Path], required)
        The path to the SentencePiece model.

    tokenizer_version (Optional[TokenizerVersion], default None)
        The version of the tokenizer. If not provided, it will be inferred from the model path.
Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def __init__(self, model_path: Union[str, Path], tokenizer_version: Optional[TokenizerVersion] = None) -> None:
    r"""Initialize the `SentencePieceTokenizer`.

    Args:
        model_path: The path to the `SentencePiece` model.
        tokenizer_version: The version of the tokenizer. If not provided, it will be inferred from the model path.
    """
    self._logger = logging.getLogger(self.__class__.__name__)
    # reload tokenizer
    assert os.path.isfile(model_path), model_path
    self._model = SentencePieceProcessor(
        model_file=model_path if isinstance(model_path, str) else model_path.as_posix()
    )

    assert self._model.vocab_size() == self._model.get_piece_size()
    self._vocab = [self._model.id_to_piece(i) for i in range(self.n_words)]

    self._version: TokenizerVersion = tokenizer_version or get_spm_version(model_path, raise_deprecated=False)

    self._file_path = Path(model_path)
    super().__init__()
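
A minimal usage sketch (the path below is a placeholder; point it at a real SentencePiece model file, where the trailing .v3 is only an example of a version-carrying filename):

    from mistral_common.tokens.tokenizers.sentencepiece import SentencePieceTokenizer

    # Placeholder path: the ".v3" suffix lets the tokenizer version be
    # inferred from the filename when tokenizer_version is not given.
    tokenizer = SentencePieceTokenizer("./tokenizer.model.v3")
    print(tokenizer.version)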

bos_id (cached property)

The beginning-of-sentence token id.

eos_id (cached property)

The end-of-sentence token id.

file_path (property)

The path to the tokenizer model.

n_words (property)

Vocabulary size of the tokenizer.

pad_id (property)

The padding token id.

unk_id (property)

The unknown token id.

version (property)

The version of the tokenizer.
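
Continuing the sketch above, the properties can be inspected directly (all ids are model-dependent):

    print(tokenizer.bos_id, tokenizer.eos_id)
    print(tokenizer.pad_id)     # may be -1 if the model defines no padding token
    print(tokenizer.n_words)    # vocabulary size
    print(tokenizer.file_path)  # path the model was loaded from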

decode(tokens, special_token_policy=None)

Decode the given list of token ids into a string.

Note

Using special_token_policy=SpecialTokenPolicy.KEEP will keep the special tokens and the normal tokens as SentencePiece pieces.

Parameters:

    tokens (List[int], required)
        The list of token ids.

    special_token_policy (Optional[SpecialTokenPolicy], default None)
        The policy to use for special tokens. If None, SpecialTokenPolicy.IGNORE is used; passing None is deprecated and will be removed in mistral_common 1.10.0, so pass a policy explicitly.

Returns:

    str
        The decoded string.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def decode(self, tokens: List[int], special_token_policy: Optional[SpecialTokenPolicy] = None) -> str:
    r"""Decode the given list of token ids into a string.

    Note:
        Using `special_token_policy=SpecialTokenPolicy.KEEP` will keep the special tokens and the normal tokens as
        SentencePiece pieces.

    Args:
        tokens: The list of token ids.
        special_token_policy: The policy to use for special tokens. If `None`, the default policy
            is `SpecialTokenPolicy.IGNORE`.  Passing `None` is deprecated and will be changed
            to `SpecialTokenPolicy.IGNORE` in `mistral_common=1.10.0`.

    Returns:
        The decoded string.
    """
    if special_token_policy is not None and not isinstance(special_token_policy, SpecialTokenPolicy):
        raise ValueError(
            f"Expected `special_token_policy` to be None or SpecialTokenPolicy, got {type(special_token_policy)}."
        )

    if special_token_policy is None:
        warnings.warn(
            (
                "Using the tokenizer's special token policy `None` is deprecated. "
                "It will be removed in 1.10.0. "
                "Please pass a special token policy explicitly. "
                "Future default will be SpecialTokenPolicy.IGNORE."
            ),
            FutureWarning,
        )
        special_token_policy = SpecialTokenPolicy.IGNORE

    if special_token_policy in [SpecialTokenPolicy.KEEP, SpecialTokenPolicy.RAISE]:
        return self._decode_with_special_tokens(tokens, special_token_policy)

    return self._model.decode(tokens)  # type: ignore
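
A short sketch of the policy's effect, assuming SpecialTokenPolicy is importable from mistral_common.tokens.tokenizers.base and continuing the tokenizer instance from above:

    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

    ids = tokenizer.encode("Hello world", bos=True, eos=True)

    # IGNORE drops special tokens from the decoded text.
    print(tokenizer.decode(ids, special_token_policy=SpecialTokenPolicy.IGNORE))

    # KEEP returns SentencePiece pieces, special tokens included.
    print(tokenizer.decode(ids, special_token_policy=SpecialTokenPolicy.KEEP))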

encode(s, bos, eos)

Encode the given string into a list of token ids.

Parameters:

    s (str, required)
        The string to encode.

    bos (bool, required)
        Whether to add the beginning-of-sentence token.

    eos (bool, required)
        Whether to add the end-of-sentence token.

Returns:

    List[int]
        The list of token ids.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
    r"""Encode the given string into a list of token ids.

    Args:
        s: The string to encode.
        bos: Whether to add the beginning of sentence token.
        eos: Whether to add the end of sentence token.

    Returns:
        The list of token ids.
    """
    assert isinstance(s, str)
    t: List[int] = self._model.encode(s)
    if bos:
        t = [self.bos_id, *t]
    if eos:
        t = [*t, self.eos_id]
    return t
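
As the source shows, bos and eos simply prepend and append the corresponding token ids; for example, continuing the instance from above:

    ids = tokenizer.encode("Hello world", bos=True, eos=False)
    assert ids[0] == tokenizer.bos_id

    ids = tokenizer.encode("Hello world", bos=False, eos=True)
    assert ids[-1] == tokenizer.eos_id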

get_control_token(s)

Get the control token for the given string.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def get_control_token(self, s: str) -> int:
    r"""Get the control token for the given string."""
    return self._model.piece_to_id(s)  # type: ignore

id_to_piece(token_id)

Convert the given token id to a token piece.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def id_to_piece(self, token_id: int) -> str:
    r"""Convert the given token id to a token piece."""
    return self._model.id_to_piece(token_id)  # type: ignore
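
id_to_piece is the inverse of get_control_token at the piece level; a sketch (the "[INST]" string is hypothetical and must exist in the loaded model's vocabulary, otherwise SentencePiece maps it to the unknown id):

    token_id = tokenizer.get_control_token("[INST]")  # hypothetical control token
    assert tokenizer.id_to_piece(token_id) == "[INST]"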

to_string(tokens)

[DEPRECATED] Convert a list of token ids into a string, keeping special tokens.

Use decode with special_token_policy=SpecialTokenPolicy.KEEP instead.

This is a convenience method for debugging.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def to_string(self, tokens: List[int]) -> str:
    r"""[DEPRECATED] Converts a list of token ids into a string, keeping special tokens.

    Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead.

    This is a convenient method for debugging.
    """
    warnings.warn(
        (
            "`to_string` is deprecated and will be removed in 1.10.0. "
            "Use `decode` with `special_token_policy=SpecialTokenPolicy.KEEP` instead."
        ),
        FutureWarning,
    )
    return self._to_string(tokens)
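
The recommended replacement, sketched with the SpecialTokenPolicy import path assumed above:

    from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

    # Instead of the deprecated tokenizer.to_string(ids):
    text = tokenizer.decode(ids, special_token_policy=SpecialTokenPolicy.KEEP)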

vocab()

All tokens in the vocabulary as strings.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def vocab(self) -> List[str]:
    r"""All tokens in the vocabulary as strings."""
    return self._vocab

get_image_config(tokenizer_filename)

Get the image config from the tokenizer filename.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def get_image_config(tokenizer_filename: Union[str, Path]) -> Optional[ImageConfig]:
    r"""Get the image config from the tokenizer filename."""
    tokenizer_filename = str(tokenizer_filename)

    _version_str = tokenizer_filename.split(".")[-1]
    if _version_str == "model" or "m" not in _version_str:
        return None

    _mm_version_str = "m" + _version_str.split("m")[-1]

    if _mm_version_str not in MultiModalVersion.__members__:
        raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

    return MultiModalVersion(_mm_version_str).config
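
The result follows the filename suffix, as the source shows; for example (filenames are illustrative, and m1 is assumed to be a defined MultiModalVersion member):

    from mistral_common.tokens.tokenizers.sentencepiece import get_image_config

    print(get_image_config("tokenizer.model"))       # None: bare .model suffix
    print(get_image_config("tokenizer.model.v3"))    # None: no multimodal part
    print(get_image_config("tokenizer.model.v3m1"))  # ImageConfig for MultiModalVersion.m1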

get_spm_version(tokenizer_filename, raise_deprecated=False)

Get the version of the tokenizer from the filename.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def get_spm_version(tokenizer_filename: Union[str, Path], raise_deprecated: bool = False) -> TokenizerVersion:
    r"""Get the version of the tokenizer from the filename."""
    tokenizer_filename = str(tokenizer_filename)

    _version_str = tokenizer_filename.split(".")[-1]
    if _version_str != "model":  # filter tokenizer_filename == "/path/to/tokenizer.model" case
        _version_str = _version_str.split("m")[0]

    if _version_str == "model":
        if raise_deprecated:
            raise TokenizerException(f"Make sure to rename your tokenizer file to end with {tokenizer_filename}.v1.")

        # tokenizer.model => tokenizer.model.v1
        return TokenizerVersion("v1")

    if _version_str not in TokenizerVersion.__members__:
        raise TokenizerException(f"Unrecognized tokenizer filename: {tokenizer_filename}")

    return TokenizerVersion(_version_str)
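
For example (filenames are illustrative):

    from mistral_common.tokens.tokenizers.sentencepiece import get_spm_version

    print(get_spm_version("tokenizer.model.v3"))    # TokenizerVersion.v3
    print(get_spm_version("tokenizer.model.v3m1"))  # TokenizerVersion.v3 (multimodal part stripped)
    print(get_spm_version("tokenizer.model"))       # TokenizerVersion.v1 (legacy fallback)

    # With raise_deprecated=True, the bare "tokenizer.model" case raises
    # a TokenizerException instead of falling back to v1.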

is_sentencepiece(path)

Check if the given path is a SentencePiece model.

Source code in src/mistral_common/tokens/tokenizers/sentencepiece.py
def is_sentencepiece(path: Union[str, Path]) -> bool:
    r"""Check if the given path is a SentencePiece model."""
    if isinstance(path, str):
        path = Path(path)

    instruct_versions = list(TokenizerVersion.__members__)
    mm_versions = list(MultiModalVersion.__members__) + [""]  # allow no mm version
    suffixes = [f".model.{v}{m}" for v in instruct_versions for m in mm_versions] + [".model"]

    return path.is_file() and any(path.name.endswith(suffix) for suffix in suffixes)
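
A quick sketch (the path is a placeholder; the check requires an existing file with a recognized suffix):

    from pathlib import Path
    from mistral_common.tokens.tokenizers.sentencepiece import is_sentencepiece

    # True only if the file exists and its name ends with .model or a
    # versioned suffix such as .model.v3 or .model.v3m1.
    print(is_sentencepiece(Path("./tokenizer.model.v3")))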