mistral_common.tokens.tokenizers.tekken

ModelData

Bases: TypedDict

The data of the tekken tokenizer model.

Attributes:

vocab (List[TokenInfo]): The vocabulary of the tokenizer.

config (TekkenConfig): The configuration of the tokenizer.

version (int): The version of the tokenizer.

type (str): The type of the tokenizer.

multimodal (MultimodalConfig): The multimodal configuration of the tokenizer.
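As a rough illustration of this layout, the sketch below builds a ModelData-shaped dict by hand. All values are made up; the vocab entries follow the TokenInfo fields and the config follows the TekkenConfig fields documented further down this page.

# Illustrative only: a ModelData-shaped dict with hypothetical values.
model_data = {
    "vocab": [
        # list of TokenInfo entries (see TokenInfo below); "YQ==" is base64 for b"a"
        {"rank": 0, "token_bytes": "YQ==", "token_str": "a"},
    ],
    "config": {
        # TekkenConfig fields (see TekkenConfig below); values are hypothetical
        "pattern": r"\S+|\s+",
        "num_vocab_tokens": 1,
        "default_vocab_size": 1001,
        "default_num_special_tokens": 1000,
        "version": "v7",
    },
    "version": 1,
    "type": "Tekken",    # hypothetical type string
    "multimodal": None,  # or a MultimodalConfig for multimodal tokenizers
}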

SpecialTokenInfo

Bases: TypedDict

Special token information in the JSON file.

Attributes:

rank (int): The rank of the token.

token_str (str): The token in string format.

is_control (bool): Whether the token is a control token.
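For example, a single entry might look like the following sketch; the token string is hypothetical.

# Illustrative SpecialTokenInfo entry; the token string is made up.
special_token = {"rank": 0, "token_str": "<unk>", "is_control": True}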

SpecialTokenPolicy

Bases: Enum

What to do with special tokens when encoding/decoding.

Attributes:

IGNORE: Ignore special tokens.

KEEP: Keep special tokens.

RAISE: Raise an error if special tokens are found.
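A minimal sketch of how the policy affects decoding, assuming a tekken.json tokenizer file is available locally (the path is illustrative); the special_token_policy property of Tekkenizer is writable, as documented below.

from mistral_common.tokens.tokenizers.tekken import SpecialTokenPolicy, Tekkenizer

tok = Tekkenizer.from_file("tekken.json")  # illustrative path
ids = tok.encode("Hello", bos=True, eos=True)

tok.special_token_policy = SpecialTokenPolicy.IGNORE  # default: drop special tokens
print(tok.decode(ids))                                # "Hello"

tok.special_token_policy = SpecialTokenPolicy.KEEP    # keep BOS/EOS strings in the output
print(tok.decode(ids))

tok.special_token_policy = SpecialTokenPolicy.RAISE   # error on special tokens
# tok.decode(ids)  # would raise, since ids contain the BOS and EOS tokens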

TekkenConfig

Bases: TypedDict

Tekken configuration in the JSON file.

Attributes:

pattern (str): The pattern of the tokenizer.

num_vocab_tokens (int): The number of vocabulary tokens.

default_vocab_size (int): The default vocabulary size.

default_num_special_tokens (int): The default number of special tokens.

version (str): The version of the tokenizer.

Tekkenizer(vocab, special_tokens, pattern, vocab_size, num_special_tokens, version, *, name='tekkenizer', _path=None, mm_config=None)

Bases: Tokenizer

Tekken tokenizer.

This tokenizer is based on the tiktoken library, which speeds up tokenization across multiple languages.

Parameters:

vocab (List[TokenInfo]): The vocabulary of the tokenizer. Required.

special_tokens (List[SpecialTokenInfo]): The special tokens of the tokenizer. Required.

pattern (str): The pattern of the tokenizer. Required.

vocab_size (int): The vocabulary size of the tokenizer. Required.

num_special_tokens (int): The number of special tokens of the tokenizer. Required.

version (TokenizerVersion): The version of the tokenizer. Required.

name (str): The name of the tokenizer. Defaults to 'tekkenizer'.

mm_config (Optional[MultimodalConfig]): The multimodal configuration of the tokenizer. Defaults to None.
Source code in src/mistral_common/tokens/tokenizers/tekken.py
def __init__(
    self,
    vocab: List[TokenInfo],
    special_tokens: List[SpecialTokenInfo],
    pattern: str,
    vocab_size: int,
    num_special_tokens: int,
    version: TokenizerVersion,
    *,
    name: str = "tekkenizer",
    _path: Optional[str] = None,
    mm_config: Optional[MultimodalConfig] = None,
):
    r"""Initialize the tekken tokenizer.

    Args:
        vocab: The vocabulary of the tokenizer.
        special_tokens: The special tokens of the tokenizer.
        pattern: The pattern of the tokenizer.
        vocab_size: The vocabulary size of the tokenizer.
        num_special_tokens: The number of special tokens of the tokenizer.
        version: The version of the tokenizer.
        name: The name of the tokenizer.
        mm_config: The multimodal configuration of the tokenizer.
    """
    assert vocab_size <= len(vocab) + num_special_tokens, (
        vocab_size,
        len(vocab),
        num_special_tokens,
    )
    self._vocab_size = vocab_size

    # The number of special tokens defined in the tokenizer json
    num_defined_special_tokens = len(set([t["token_str"] for t in special_tokens]))

    assert len(special_tokens) == num_defined_special_tokens, f"Special tokens must be unique: {special_tokens}"
    assert len(special_tokens) <= num_special_tokens

    special_filler = [
        SpecialTokenInfo(rank=i, token_str=self.SPECIAL_TOKEN_TEMPLATE.format(id=i), is_control=True)
        for i in range(len(special_tokens), num_special_tokens)
    ]
    if special_filler:
        logger.info(
            f"Adding special tokens {special_filler[0]['token_str']}, ..., {special_filler[-1]['token_str']}"
        )
    special_tokens = special_tokens + special_filler

    assert len(set([t["token_str"] for t in special_tokens])) == len(special_tokens) == num_special_tokens, (
        special_tokens
    )
    inner_vocab_size = vocab_size - num_special_tokens

    # reload vocab
    self._tekken_token2id_nospecial = _reload_mergeable_ranks(vocab, max_vocab=inner_vocab_size)
    assert set(range(inner_vocab_size)) == set(self._tekken_token2id_nospecial.values()), (
        inner_vocab_size,
        self._tekken_token2id_nospecial,
    )
    self._model = tiktoken.Encoding(
        name=name,
        pat_str=pattern,
        mergeable_ranks=self._tekken_token2id_nospecial,
        special_tokens={},  # special tokens are handled manually
    )

    self._version = version
    self._mm_config = mm_config
    self._all_special_tokens = special_tokens
    self._special_tokens_reverse_vocab = {t["token_str"]: t["rank"] for t in special_tokens}
    self._vocab = [self.id_to_piece(i) for i in range(vocab_size)]
    self._special_token_policy = SpecialTokenPolicy.IGNORE

bos_id cached property

The beginning of sentence token id.

eos_id cached property

The end of sentence token id.

multimodal property writable

The multimodal configuration of the tokenizer.

n_words property

Vocabulary size of the tokenizer.

num_special_tokens property

The number of special tokens of the tokenizer.

pad_id cached property

The padding token id.

special_token_policy property writable

The policy for handling special tokens.

unk_id cached property

The unknown token id.

version property

The version of the tokenizer.
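A short sketch that loads a tokenizer and inspects the properties above; the file name is illustrative.

from mistral_common.tokens.tokenizers.tekken import Tekkenizer

tok = Tekkenizer.from_file("tekken.json")  # illustrative path

print(tok.n_words)             # vocabulary size, including special tokens
print(tok.num_special_tokens)  # number of ids reserved for special tokens
print(tok.bos_id, tok.eos_id)  # beginning / end of sentence token ids
print(tok.pad_id, tok.unk_id)  # padding and unknown token ids
print(tok.version)             # TokenizerVersion of the loaded file
print(tok.multimodal)          # MultimodalConfig, or None if not multimodal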

decode(tokens)

Decode a list of token ids into a string.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def decode(self, tokens: List[int]) -> str:
    r"""Decode a list of token ids into a string."""
    return "".join(self._decode_all(tokens, special_token_policy=self._special_token_policy))

encode(s, bos, eos)

Encode a string into a list of token ids.

Parameters:

s (str): The string to encode. Required.

bos (bool): Whether to add the beginning of sentence token. Required.

eos (bool): Whether to add the end of sentence token. Required.

Returns:

List[int]: The list of token ids.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
    r"""Encode a string into a list of token ids.

    Args:
        s: The string to encode.
        bos: Whether to add the beginning of sentence token.
        eos: Whether to add the end of sentence token.

    Returns:
        The list of token ids.
    """
    tokens: List[int] = self._model.encode(s)
    tokens = [t + self.num_special_tokens for t in tokens]
    if bos:
        tokens = [self.bos_id, *tokens]
    if eos:
        tokens = [*tokens, self.eos_id]
    return tokens
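A round-trip sketch, assuming an illustrative tekken.json file. Note that encode() shifts the raw tiktoken ids up by num_special_tokens, so ids below that value are reserved for special tokens.

from mistral_common.tokens.tokenizers.tekken import Tekkenizer

tok = Tekkenizer.from_file("tekken.json")  # illustrative path

ids = tok.encode("Hello world", bos=True, eos=True)
# Every id is either a special token (here only BOS/EOS) or a shifted regular token.
assert all(i >= tok.num_special_tokens or i in (tok.bos_id, tok.eos_id) for i in ids)

print(tok.decode(ids))  # "Hello world" (special tokens dropped under the default policy)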

from_file(path) classmethod

Load the tekken tokenizer from a file.

Parameters:

path (Union[str, Path]): The path to the tokenizer file. Required.

Returns:

Tekkenizer: The tekken tokenizer.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
@classmethod
def from_file(cls: Type["Tekkenizer"], path: Union[str, Path]) -> "Tekkenizer":
    r"""Load the tekken tokenizer from a file.

    Args:
        path: The path to the tokenizer file.

    Returns:
        The tekken tokenizer.
    """
    if isinstance(path, str):
        path = Path(path)
    assert path.exists(), path
    with open(path, "r") as f:
        untyped = json.load(f)

    _version_str = untyped["config"].get("version")
    if _version_str not in TokenizerVersion.__members__:
        raise ValueError(
            f"Unknown version: {_version_str} in {path}. "
            f"Make sure to use a valid version string: {list(TokenizerVersion.__members__)}"
        )

    assert _version_str is not None
    version = TokenizerVersion(_version_str)

    special_tokens_dicts: Optional[List[SpecialTokenInfo]] = untyped.get("special_tokens", None)
    if special_tokens_dicts is None:
        err_msg = (
            f"Special tokens not found in {path} and default to {Tekkenizer.DEPRECATED_SPECIAL_TOKENS}. "
            "This behavior will be deprecated going forward. "
            "Please update your tokenizer file and include all special tokens you need."
        )
        # Tokenizer > v7 should find special tokens in the tokenizer file
        if version > TokenizerVersion("v7"):
            raise ValueError(err_msg)
        else:
            warnings.warn(
                err_msg,
                FutureWarning,
            )
            special_tokens = list(Tekkenizer.DEPRECATED_SPECIAL_TOKENS)
    else:
        special_tokens = [token for token in special_tokens_dicts]

    untyped["special_tokens"] = special_tokens

    if mm := untyped.get("multimodal", None):
        untyped["multimodal"] = MultimodalConfig(**mm)

    model_data: ModelData = untyped

    return cls(
        vocab=model_data["vocab"],
        special_tokens=special_tokens,
        pattern=model_data["config"]["pattern"],
        vocab_size=model_data["config"]["default_vocab_size"],
        num_special_tokens=model_data["config"]["default_num_special_tokens"],
        version=version,
        name=path.name.replace(".json", ""),
        mm_config=model_data.get("multimodal"),
    )

get_control_token(s)

Get the token id of a control token.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def get_control_token(self, s: str) -> int:
    r"""Get the token id of a control token."""
    if s in self._special_tokens_reverse_vocab:
        return self._special_tokens_reverse_vocab[s]
    else:
        raise ValueError(f"Unknown control token {s}")
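A sketch of looking up a control token by its string form; the token string below is hypothetical, since the available control tokens depend on the tokenizer file, and unknown strings raise ValueError.

from mistral_common.tokens.tokenizers.tekken import Tekkenizer

tok = Tekkenizer.from_file("tekken.json")  # illustrative path

try:
    token_id = tok.get_control_token("[INST]")  # hypothetical control-token string
except ValueError:
    token_id = None  # the token is not defined in this tokenizer file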

id_to_byte_piece(token_id)

Convert a token id to its byte representation.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def id_to_byte_piece(self, token_id: int) -> bytes:
    r"""Convert a token id to its byte representation."""
    if token_id < self.num_special_tokens:
        if self._special_token_policy == SpecialTokenPolicy.KEEP:
            return self._all_special_tokens[token_id]["token_str"].encode("utf-8")
        elif self._special_token_policy == SpecialTokenPolicy.RAISE:
            raise ValueError(f"{token_id} is a special token")

    return self._model.decode_single_token_bytes(token_id - self.num_special_tokens)

id_to_piece(token_id)

Convert a token id to its string representation.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def id_to_piece(self, token_id: int) -> str:
    r"""Convert a token id to its string representation."""
    return self._decode_all([token_id], special_token_policy=SpecialTokenPolicy.KEEP)[0]

is_byte(token_id)

Check if a token id is a byte token.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def is_byte(self, token_id: int) -> bool:
    r"""Check if a token id is a byte token."""
    return 0 <= token_id - self.num_special_tokens < 256
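Taken together with id_to_byte_piece, this means the 256 byte-level tokens sit directly after the special-token id range. A sketch, with an illustrative file name:

from mistral_common.tokens.tokenizers.tekken import Tekkenizer

tok = Tekkenizer.from_file("tekken.json")  # illustrative path

first_byte_id = tok.num_special_tokens
assert tok.is_byte(first_byte_id)            # first id after the special tokens
assert not tok.is_byte(first_byte_id - 1)    # last special-token id
assert not tok.is_byte(first_byte_id + 256)  # first merged (non-byte) token

print(tok.id_to_byte_piece(first_byte_id))   # raw bytes of that byte token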

to_string(tokens)

Decode a list of token ids into a string keeping special tokens for debugging purposes.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def to_string(self, tokens: List[int]) -> str:
    r"""Decode a list of token ids into a string keeping special tokens for debugging purposes."""
    return "".join(self._decode_all(tokens, special_token_policy=SpecialTokenPolicy.KEEP))

vocab()

All tokens in the vocabulary as strings.

Note

This will collapse all tokens for which we have a decoding error into the <?> string. This is bad and results in things like len(set(vocab)) != len(vocab).

Returns:

List[str]: The vocabulary of the tokenizer.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def vocab(self) -> List[str]:
    r"""All tokens in the vocabulary as strings.

    Note:
       This will collapse all tokens for which we have a decoding error into
       the <?> string. This is bad and results in things like len(set(vocab)) != len(vocab)).

    Returns:
        The vocabulary of the tokenizer.
    """
    # when returning self._vocab this will collapse
    # all tokens for which we have a decoding error into
    # the <?> string. This is bad and results in things
    # like len(set(vocab)) != len(vocab))
    # be careful when using self._vocab
    return self._vocab
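A sketch of the caveat above, assuming an illustrative tokenizer file: the list returned by vocab() can contain duplicate entries where decoding failed.

from mistral_common.tokens.tokenizers.tekken import Tekkenizer

tok = Tekkenizer.from_file("tekken.json")  # illustrative path

pieces = tok.vocab()
print(len(pieces))       # the full vocabulary size
print(len(set(pieces)))  # may be smaller, because undecodable tokens collapse to <?>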

TokenInfo

Bases: TypedDict

Token information in the JSON file.

Attributes:

rank (int): The rank of the token.

token_bytes (str): The token in bytes, base64 encoded.

token_str (Optional[str]): The token in string format.
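An illustrative TokenInfo entry: token_bytes is the base64 encoding of the token's raw bytes, and token_str may be None when those bytes are not valid text on their own.

import base64

# Illustrative only: a hand-built TokenInfo entry.
token_info = {
    "rank": 42,
    "token_bytes": base64.b64encode(b"hello").decode("ascii"),  # "aGVsbG8="
    "token_str": "hello",
}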

is_tekken(path)

Check if the given path is a tekken tokenizer file.

Source code in src/mistral_common/tokens/tokenizers/tekken.py
def is_tekken(path: Union[str, Path]) -> bool:
    r"""Check if the given path is a tekken tokenizer file."""
    if isinstance(path, str):
        path = Path(path)
    return path.is_file() and "tekken" in path.name and path.suffix == ".json"
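A sketch that uses this check as a guard before loading; the file name is illustrative but follows the convention the function checks for (a .json file with "tekken" in its name).

from pathlib import Path

from mistral_common.tokens.tokenizers.tekken import Tekkenizer, is_tekken

path = Path("tekken_240718.json")  # illustrative file name
if is_tekken(path):
    tok = Tekkenizer.from_file(path)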