Skip to content

mistral_common.tokens.tokenizers.multimodal

ImageEncoder(mm_config, special_ids)

Bases: MultiModalEncoder

Image encoder for the multimodal tokenizer.

Parameters:

Name Type Description Default
mm_config MultimodalConfig

Configuration for the multimodal tokenizer.

required
special_ids SpecialImageIDs

Special image token IDs.

required
Source code in src/mistral_common/tokens/tokenizers/multimodal.py
def __init__(self, mm_config: MultimodalConfig, special_ids: SpecialImageIDs) -> None:
    r"""Store the configuration and special token ids used by the encoder.

    Args:
        mm_config: Configuration for the multimodal tokenizer.
        special_ids: Special image token ids.
    """
    # Both values are kept as-is; they are read later when encoding an image.
    self.mm_config, self.special_ids = mm_config, special_ids

__call__(content)

Converts an image chunk to an image encoding.

Parameters:

Name Type Description Default
content Union[ImageChunk, ImageURLChunk]

Image chunk to be converted.

required

Returns:

Type Description
ImageEncoding

Image encoding.

Source code in src/mistral_common/tokens/tokenizers/multimodal.py
def __call__(self, content: Union[ImageChunk, ImageURLChunk]) -> ImageEncoding:
    r"""Encode an image chunk as image tokens plus a resized, normalized image.

    Args:
        content: Image chunk to be converted.

    Returns:
        Image encoding holding the token ids and the processed image.
    """
    image = image_from_chunk(content)

    # Token grid: `cols` image tokens per row, `rows` rows.
    cols, rows = self._image_to_num_tokens(image)
    assert cols > 0
    assert rows > 0

    ids = self.special_ids
    # Every row is `cols` image tokens followed by a break token ...
    row_tokens = [ids.img] * cols + [ids.img_break]
    image_tokens = row_tokens * rows
    # ... and the trailing break of the last row is replaced by the end token.
    image_tokens[-1] = ids.img_end

    # Pixel size matching the token grid: one token covers
    # `image_patch_size * spatial_merge_size` pixels along each axis.
    cfg = self.mm_config
    target_size = (
        cols * cfg.image_patch_size * cfg.spatial_merge_size,
        rows * cfg.image_patch_size * cfg.spatial_merge_size,
    )
    return ImageEncoding(tokens=image_tokens, image=transform_image(image, target_size))

MultiModalVersion

Bases: str, Enum

Version of the multimodal tokenizer.

MultimodalConfig(image_patch_size, max_image_size, spatial_merge_size=1) dataclass

Configuration for the multimodal tokenizers.

image_from_chunk(chunk)

Get a serializable image from a chunk.

Parameters:

Name Type Description Default
chunk Union[ImageURLChunk, ImageChunk]

The chunk to get the image from.

required

Returns:

Type Description
SerializableImage

The image as a PIL Image object.

Source code in src/mistral_common/tokens/tokenizers/multimodal.py
def image_from_chunk(chunk: Union[ImageURLChunk, ImageChunk]) -> SerializableImage:
    r"""Get a serializable image from a chunk.

    An `ImageChunk` already carries its image. For an `ImageURLChunk` the image
    is loaded according to the URL prefix: `data:image` (inline base64),
    `file` (local path) or `http` (download).

    Args:
        chunk: The chunk to get the image from.

    Returns:
        The image as a PIL Image object.

    Raises:
        RuntimeError: If the URL does not start with a supported scheme.
    """
    if isinstance(chunk, ImageChunk):
        return chunk.image

    # Resolve the URL once instead of on every branch.
    url = chunk.get_url()

    if url.startswith("data:image"):
        # The base64 payload follows the comma that ends the data-URL header.
        data = url.split(",")[1]
        image_data = base64.b64decode(data)
        return Image.open(BytesIO(image_data))
    if url.startswith("file"):
        # Hand Pillow the path rather than an already-open file object: Pillow
        # then owns the file handle and closes it itself, whereas a handle
        # opened by the caller is never closed and leaks.
        return Image.open(url.replace("file://", ""))
    if url.startswith("http"):
        return download_image(url)

    raise RuntimeError(f"Unsupported image url scheme {url}")

is_cv2_installed()

Check if OpenCV is installed.

Source code in src/mistral_common/tokens/tokenizers/multimodal.py
def is_cv2_installed() -> bool:
    r"""Report whether OpenCV (`cv2`) is available, per the module-level flag."""
    return _cv2_installed

normalize(np_image, mean, std)

Normalize a tensor image with mean and standard deviation.

Parameters:

Name Type Description Default
np_image ndarray

Image to be normalized.

required
mean Tuple[float, float, float]

Mean for each channel.

required
std Tuple[float, float, float]

Standard deviation for each channel.

required

Returns:

Type Description
ndarray

Normalized image with shape (C, H, W).

Source code in src/mistral_common/tokens/tokenizers/multimodal.py
def normalize(
    np_image: np.ndarray,
    mean: Tuple[float, float, float],
    std: Tuple[float, float, float],
) -> np.ndarray:
    r"""Scale an image to [0, 1], standardize each channel and move channels first.

    Args:
        np_image: Image to be normalized, with channels on the last axis.
        mean: Mean for each channel.
        std: Standard deviation for each channel.

    Returns:
        Normalized image with shape (C, H, W).
    """
    # Pixel values are divided by 255 before the per-channel statistics apply.
    np_image = np_image / 255.0

    assert np_image.ndim == 3, f"{np_image.shape=}"
    assert np_image.shape[2] == len(mean) == len(std), f"{np_image.shape=}, {mean=}, {std=}"

    # `mean` and `std` broadcast along the last (channel) axis.
    standardized = (np_image - mean) / std

    # (H, W, C) -> (C, H, W)
    return np.transpose(standardized, (2, 0, 1))

transform_image(image, new_size)

Transform an image to a numpy array with the given size.

Parameters:

Name Type Description Default
image Image

Image to be transformed.

required
new_size Tuple[int, int]

New size of the image.

required

Returns:

Type Description
ndarray

Transformed image with shape (C, H, W).

Source code in src/mistral_common/tokens/tokenizers/multimodal.py
def transform_image(image: Image.Image, new_size: Tuple[int, int]) -> np.ndarray:
    r"""Resize an image to the given size and return it as a normalized array.

    Args:
        image: Image to be transformed.
        new_size: New size of the image.

    Returns:
        Transformed image with shape (C, H, W).

    Raises:
        ImportError: If OpenCV is not installed.
    """
    if not is_cv2_installed():
        raise ImportError("OpenCV is required for this function. Install it with 'pip install mistral_common[opencv]'")

    # Convert to RGB and to float32 pixels before resizing.
    rgb_pixels = np.array(_convert_to_rgb(image), dtype=np.float32)
    resized = cv2.resize(rgb_pixels, new_size, interpolation=cv2.INTER_CUBIC)

    # Standardize with the dataset statistics; this also moves channels first.
    return normalize(resized, DATASET_MEAN, DATASET_STD)