
Commit 6533d5d

Add support for the Chat input to the MLXLM model
1 parent 849a969 commit 6533d5d

4 files changed (+150, -16 lines)
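At a glance, the user-visible change, sketched from the documentation and code below (model name as used in the docs; the printed output is illustrative). Before this commit, passing a `Chat` to an `MLXLM` model raised `NotImplementedError`; it is now rendered through the tokenizer's chat template:

```python
import outlines
import mlx_lm
from outlines.inputs import Chat

# Load an MLX model; mlx_lm.load returns a (model, tokenizer) pair
model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

# A Chat prompt can now be passed directly to the model
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the capital of Latvia?"},
])
print(model(prompt, max_tokens=50))  # 'Riga.'
```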

docs/features/models/llamacpp.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ prompt = Chat([
 
 # Call the model to generate a response
 response = model(prompt, max_tokens=50)
-print(response) # 'This is a picture of a black dog.'
+print(response) # 'Riga.'
 ```
 
 #### Streaming

docs/features/models/mlxlm.md

Lines changed: 42 additions & 13 deletions
@@ -29,7 +29,7 @@ import mlx_lm
 
 # Create the model
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 ```
 
@@ -45,14 +45,43 @@ import mlx_lm
 
 # Load the model
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 # Call it to generate text
 result = model("What's the capital of Latvia?", max_tokens=20)
 print(result) # 'Riga'
 ```
 
+#### Chat
+
+You can use chat inputs with the `MLXLM` model. To do so, call the model with a `Chat` instance.
+
+For instance:
+
+```python
+import outlines
+import mlx_lm
+from outlines.inputs import Chat
+
+# Load the model
+model = outlines.from_mlxlm(
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
+)
+
+# Create the chat prompt
+prompt = Chat([
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What's the capital of Latvia?"},
+])
+
+# Call the model to generate a response
+response = model(prompt, max_tokens=50)
+print(response) # 'Riga.'
+```
+
+#### Streaming
+
 The `MLXLM` model also supports streaming. For instance:
 
 ```python
@@ -61,7 +90,7 @@ import mlx_lm
 
 # Load the model
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 # Stream text
@@ -73,7 +102,7 @@ for chunk in model.stream("Write a short story about a cat.", max_tokens=100):
 
 As a local model, `MLXLM` supports all forms of structured generation available in Outlines.
 
-### Basic Type
+#### Basic Type
 
 ```python
 import outlines
@@ -82,14 +111,14 @@ import mlx_lm
 output_type = int
 
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 result = model("How many countries are there in the world?", output_type)
 print(result) # '200'
 ```
 
-### JSON Schema
+#### JSON Schema
 
 ```python
 from pydantic import BaseModel
@@ -103,15 +132,15 @@ class Character(BaseModel):
     skills: List[str]
 
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 result = model("Create a character.", output_type=Character)
 print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
 print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
 ```
 
-### Multiple Choice
+#### Multiple Choice
 
 ```python
 from typing import Literal
@@ -121,14 +150,14 @@ import mlx_lm
 output_type = Literal["Paris", "London", "Rome", "Berlin"]
 
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 result = model("What is the capital of France?", output_type)
 print(result) # 'Paris'
 ```
 
-### Regex
+#### Regex
 
 ```python
 from outlines.types import Regex
@@ -138,14 +167,14 @@ import mlx_lm
 output_type = Regex(r"\d{3}-\d{2}-\d{4}")
 
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 result = model("Generate a fake social security number.", output_type)
 print(result) # '782-32-3789'
 ```
 
-### Context-Free Grammar
+#### Context-Free Grammar
 
 ```python
 from outlines.types import CFG
@@ -175,7 +204,7 @@ arithmetic_grammar = """
 output_type = CFG(arithmetic_grammar)
 
 model = outlines.from_mlxlm(
-    *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
+    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
 )
 
 result = model("Write an addition.", output_type, max_tokens=20)
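Since a `Chat` is just a wrapper around a list of messages (the type adapter reads `model_input.messages`), a conversation can be continued by rebuilding the prompt with the model's reply appended. A minimal sketch building on the Chat example above; the follow-up question and the model's answers are illustrative:

```python
# Continue the conversation from the Chat example above: append the
# model's reply and a new user turn, then call the model again.
response = model(prompt, max_tokens=50)

prompt = Chat(prompt.messages + [
    {"role": "assistant", "content": response},
    {"role": "user", "content": "And what is its population?"},
])
print(model(prompt, max_tokens=50))
```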

outlines/models/mlxlm.py

Lines changed: 23 additions & 2 deletions
@@ -3,6 +3,7 @@
 from functools import singledispatchmethod
 from typing import TYPE_CHECKING, Iterator, List, Optional
 
+from outlines.inputs import Chat
 from outlines.models.base import Model, ModelTypeAdapter
 from outlines.models.transformers import TransformerTokenizer
 from outlines.processors import OutlinesLogitsProcessor
@@ -17,6 +18,9 @@
 class MLXLMTypeAdapter(ModelTypeAdapter):
     """Type adapter for the `MLXLM` model."""
 
+    def __init__(self, **kwargs):
+        self.tokenizer = kwargs.get("tokenizer")
+
     @singledispatchmethod
     def format_input(self, model_input):
         """Generate the prompt argument to pass to the model.
3438
"""
3539
raise NotImplementedError(
3640
f"The input type {input} is not available with mlx-lm. "
37-
"The only available type is `str`."
41+
"The available types are `str` and `Chat`."
3842
)
3943

4044
@format_input.register(str)
4145
def format_str_input(self, model_input: str):
4246
return model_input
4347

48+
@format_input.register(Chat)
49+
def format_chat_input(self, model_input: Chat) -> str:
50+
if not all(
51+
isinstance(message["content"], str)
52+
for message in model_input.messages
53+
):
54+
raise ValueError(
55+
"mlx-lm does not support multi-modal messages."
56+
+ "The content of each message must be a string."
57+
)
58+
59+
return self.tokenizer.apply_chat_template(
60+
model_input.messages,
61+
tokenize=False,
62+
add_generation_prompt=True,
63+
)
64+
4465
def format_output_type(
4566
self, output_type: Optional[OutlinesLogitsProcessor] = None,
4667
) -> Optional[List[OutlinesLogitsProcessor]]:
@@ -92,7 +113,7 @@ def __init__(
         self.mlx_tokenizer = tokenizer
         # self.tokenizer is used by the logits processor
         self.tokenizer = TransformerTokenizer(tokenizer._tokenizer)
-        self.type_adapter = MLXLMTypeAdapter()
+        self.type_adapter = MLXLMTypeAdapter(tokenizer=tokenizer)
 
     def generate(
         self,
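A design note on the change above: `format_chat_input` does not hard-code a prompt format; it defers to the tokenizer's Hugging Face-style chat template, so each model's own special tokens are used. A minimal sketch of that call in isolation, assuming Apple Silicon and the SmolLM tokenizer used in the tests below (the rendered markers vary by model; the expected string mirrors the test's assertion):

```python
import mlx_lm

# mlx_lm.load returns a (model, tokenizer) pair; the tokenizer wraps a
# Hugging Face tokenizer and exposes apply_chat_template.
_, tokenizer = mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")

messages = [{"role": "user", "content": "Hello, world!"}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return the rendered string, not token ids
    add_generation_prompt=True,  # end with an open assistant turn
)
# ChatML-style output for this model:
# '<|im_start|>user\nHello, world!<|im_end|>\n<|im_start|>assistant\n'
```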
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+import pytest
+import io
+
+from outlines_core import Index, Vocabulary
+from PIL import Image as PILImage
+
+from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor
+from outlines.inputs import Chat, Image
+from outlines.models.mlxlm import MLXLMTypeAdapter
+
+try:
+    import mlx_lm
+    import mlx.core as mx
+
+    HAS_MLX = mx.metal.is_available()
+except ImportError:
+    HAS_MLX = False
+
+
+MODEL_NAME = "mlx-community/SmolLM-135M-Instruct-4bit"
+
+
+@pytest.fixture
+def adapter():
+    _, tokenizer = mlx_lm.load(MODEL_NAME)
+    return MLXLMTypeAdapter(tokenizer=tokenizer)
+
+
+@pytest.fixture
+def logits_processor():
+    vocabulary = Vocabulary.from_pretrained(MODEL_NAME)
+    index = Index(r"[0-9]{3}", vocabulary)
+    return OutlinesCoreLogitsProcessor(index, "mlx")
+
+
+@pytest.fixture
+def image():
+    width, height = 1, 1
+    white_background = (255, 255, 255)
+    image = PILImage.new("RGB", (width, height), white_background)
+    buffer = io.BytesIO()
+    image.save(buffer, format="PNG")
+    buffer.seek(0)
+    image = PILImage.open(buffer)
+
+    return image
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_type_adapter_format_input(adapter, image):
+    # Anything other than a string/Chat (invalid)
+    with pytest.raises(NotImplementedError):
+        adapter.format_input(["Hello, world!"])
+
+    # String
+    assert adapter.format_input("Hello, world!") == "Hello, world!"
+
+    # Chat
+    messages = [
+        {"role": "user", "content": "Hello, world!"},
+        {"role": "assistant", "content": "Hello, world!"},
+    ]
+    expected = (
+        "<|im_start|>user\nHello, world!<|im_end|>\n<|im_start|>assistant\n"
+        + "Hello, world!<|im_end|>\n<|im_start|>assistant\n"
+    )
+    assert adapter.format_input(Chat(messages=messages)) == expected
+
+    # Multi-modal (invalid)
+    with pytest.raises(
+        ValueError,
+        match="mlx-lm does not support multi-modal messages."
+    ):
+        adapter.format_input(Chat(messages=[
+            {"role": "user", "content": ["prompt", Image(image)]},
+        ]))
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_type_adapter_format_output_type(adapter, logits_processor):
+    formatted = adapter.format_output_type(logits_processor)
+    assert isinstance(formatted, list)
+    assert len(formatted) == 1
+    assert isinstance(formatted[0], OutlinesCoreLogitsProcessor)
