

Installation

You need to install llama-cpp-python to use LlamaCpp models in blendsql. I used the command below on Ubuntu 25.10; the CMake flags build with CUDA enabled and skip the LLaVA components:

CMAKE_ARGS="-DGGML_CUDA=ON -DLLAMA_LLAVA=OFF -DLLAVA_BUILD=OFF" uv pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir
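
If the build succeeds, a quick smoke test can confirm that the CUDA backend is active. This is a minimal sketch, assuming you have some GGUF file on disk; the path below is a placeholder, not part of blendsql:

# Minimal check that the CUDA-enabled wheel imports and can offload layers.
from llama_cpp import Llama

llm = Llama(
    model_path="models/google_gemma-3-12b-it-Q6_K.gguf",  # placeholder path to a local GGUF file
    n_gpu_layers=-1,  # offload every layer to the GPU
    verbose=True,     # llama.cpp logs which backend/device it selected
)
print(llm("Hello", max_tokens=8)["choices"][0]["text"])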

LlamaCpp

Bases: ConstrainedModel

Class for LlamaCpp local Model.

Parameters:

Name                Type            Description                                                      Default
filename            str             Specific .gguf file (local or on HuggingFace) to load            required
model_name_or_path  Optional[str]   Optional path to the model on HuggingFace                        None
caching             bool            Bool determining whether we access the model's cache             True
config              Optional[dict]  Additional parameters to pass to the Llama() construction call   None

Examples:

from blendsql.models import LlamaCpp

model = LlamaCpp(
    filename="google_gemma-3-12b-it-Q6_K.gguf",
    model_name_or_path="bartowski/google_gemma-3-12b-it-GGUF",
    config={"n_gpu_layers": -1, "n_ctx": 8000, "seed": 100, "n_threads": 16},
)
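
A HuggingFace repo is not required: as the source below shows, when model_name_or_path is omitted the filename is passed straight to Llama(), so a local .gguf path works as well. A sketch with an illustrative path:

from blendsql.models import LlamaCpp

# Load from a local .gguf file; no HuggingFace download involved.
model = LlamaCpp(
    filename="/models/google_gemma-3-12b-it-Q6_K.gguf",  # placeholder local path
    config={"n_gpu_layers": -1, "n_ctx": 8000},
)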
Source code in blendsql/models/constrained/guidance.py
class LlamaCpp(ConstrainedModel):
    """Class for LlamaCpp local Model.

    Args:
        filename: Specific .gguf file (local or on HuggingFace) to load
        model_name_or_path: Optional path to the model on HuggingFace
        caching: Bool determining whether we access the model's cache
        config: Additional parameters to pass to the `Llama()` construction call

    Examples:
        ```python
        from blendsql.models import LlamaCpp

        model = LlamaCpp(
            filename="google_gemma-3-12b-it-Q6_K.gguf",
            model_name_or_path="bartowski/google_gemma-3-12b-it-GGUF",
            config={"n_gpu_layers": -1, "n_ctx": 8000, "seed": 100, "n_threads": 16},
        )
        ```
    """

    def __init__(
        self,
        filename: str,
        model_name_or_path: Optional[str] = None,
        config: Optional[dict] = None,
        caching: bool = True,
        **kwargs,
    ):
        if config is None:
            config = {}

        super().__init__(
            model_name_or_path=model_name_or_path,  # type: ignore
            requires_config=False,
            tokenizer=self._load_llama_cpp(
                filename=filename,
                model_name_or_path=model_name_or_path,
                config=config,
                vocab_only=True,
            ).tokenizer_,
            config=config,
            caching=caching,
            **kwargs,
        )
        self.filename = filename

    @staticmethod
    def _load_llama_cpp(
        filename: str,
        model_name_or_path: Optional[str],
        config: dict,
        vocab_only: bool = False,
    ):
        from llama_cpp import Llama

        if model_name_or_path:
            _config = config if not vocab_only else {}
            model = Llama.from_pretrained(
                repo_id=model_name_or_path,
                filename=filename,
                verbose=False,
                vocab_only=vocab_only,
                **_config,
            )
        else:
            model = Llama(filename, verbose=False, vocab_only=vocab_only, **config)

        # https://github.com/abetlen/llama-cpp-python/issues/1610
        import atexit

        @atexit.register
        def free_model():
            model.close()

        return model

    def _load_model(self) -> ModelObj:
        from guidance.models import LlamaCpp as GuidanceLlamaCpp
        import logging

        logging.getLogger("guidance").setLevel(logging.CRITICAL)
        logging.getLogger("llama_cpp").setLevel(logging.CRITICAL)

        # llama.cpp doesn't like when we have two running simultaneously
        #   so we do a little switcheroo with the tokenizer here
        if hasattr(self, "tokenizer"):
            self.__delattr__("tokenizer")

        lm = GuidanceLlamaCpp(
            self._load_llama_cpp(
                filename=self.filename,
                model_name_or_path=self.model_name_or_path,
                config=self.config,
            ),
            echo=False,
            chat_template=self.config.get("chat_template"),
        )
        self.tokenizer = lm.engine.model_obj.tokenizer_
        return lm
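
Note that _load_model() reads a chat_template key from the same config dict and hands it to guidance's LlamaCpp wrapper, while the remaining keys go to the Llama() constructor (the dict is unpacked there as well, so only add keys you expect it to tolerate). A hedged sketch of how that key would be supplied; most GGUF files already embed a template in their metadata, so None is the usual choice:

from blendsql.models import LlamaCpp

model = LlamaCpp(
    filename="google_gemma-3-12b-it-Q6_K.gguf",
    model_name_or_path="bartowski/google_gemma-3-12b-it-GGUF",
    config={
        "n_gpu_layers": -1,
        "n_ctx": 8000,
        # Read via config.get("chat_template") in _load_model(); leave as None
        # to fall back on the template stored in the GGUF metadata.
        "chat_template": None,
    },
)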