Skip to content

vLLM

VLLM

To begin, start a vLLM server. If your vLLM version is newer than 0.12.0, be sure to specify --structured-outputs-config.backend guidance.

vllm serve RedHatAI/gemma-3-12b-it-quantized.w4a16 --host 0.0.0.0 \
--port 8000 \
--enable-prefix-caching \
--max-model-len 8000 \
--structured-outputs-config.backend guidance \
--gpu_memory_utilization 0.8 \
--enable-prompt-tokens-details

Bases: ModelBase

Class for vLLM endpoints.

Parameters:

Name Type Description Default
model_name_or_path str | None

Name of the model

None
base_url str | None

Base URL for HTTP requests. Defaults to "http://localhost:8000/v1/"

None

Examples:

from blendsql.models import VLLM

model = VLLM("RedHatAI/gemma-3-12b-it-quantized.w4a16", base_url="http://localhost:8000/v1/")
Source code in blendsql/models/vllm.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
class VLLM(ModelBase):
    """Class for vLLM endpoints.

    Args:
        model_name_or_path: Name of the model
        base_url: Base URL for http requests. Defaults to "http://localhost:8000/v1/"

    Examples:
        ```python
        from blendsql.models import VLLM

        model = VLLM("RedHatAI/gemma-3-12b-it-quantized.w4a16", base_url="http://localhost:8000/v1/")
        ```
    """

    def __init__(
        self,
        model_name_or_path: str | None = None,
        api_key: str | None = None,
        base_url: str | None = None,
        *args,
        **kwargs,
    ) -> None:
        # vLLM servers typically don't check API keys, but the OpenAI-style
        # client stack requires a non-empty value, so fall back to a placeholder.
        api_key = api_key or "N.A"
        base_url = base_url or "http://localhost:8000/v1/"
        super().__init__(
            model_name_or_path=model_name_or_path,
            api_key=api_key,
            base_url=base_url,
            *args,
            **kwargs,
        )

    async def _format_inputs(
        self, extra_body: dict, item: GenerationItem
    ) -> tuple[list[dict], dict]:
        """Build OpenAI-style chat messages and the vLLM `extra_body` for one item.

        Args:
            extra_body: Mutable dict of extra request parameters; grammar keys
                are merged into it in place when `item.grammar` is set.
            item: Generation item carrying the prompt, any image URLs, and an
                optional grammar.

        Returns:
            A ``(messages, extra_body)`` tuple ready for a chat-completions call.
        """
        if item.image_urls:
            # Multimodal prompt: one text part followed by an image_url part
            # per image, all inside a single user message.
            content: list[dict] = [{"type": "text", "text": item.prompt}]
            # The HTTP session is loop-invariant — fetch it once, not per image.
            session = await self._get_session()
            for image_url in item.image_urls:
                encoded = await openai_compatible_image_url(image_url, session)
                content.append({"type": "image_url", "image_url": {"url": encoded}})
            messages = [{"role": "user", "content": content}]
        else:
            messages = [{"role": "user", "content": item.prompt}]

        if item.grammar:
            # Both the legacy `guided_*` keys and the newer `structured_outputs`
            # key are set — presumably for compatibility across vLLM versions;
            # TODO(review): confirm which one the target server honors.
            extra_body |= {
                "guided_decoding_backend": "guidance",
                "guided_grammar": item.grammar,
                "structured_outputs": {"grammar": item.grammar},
            }
        return messages, extra_body