

Installation

You need to install llama-cpp-python to use LlamaCpp models in blendsql. I used the command below on Ubuntu 25.10; the CMake flags build with CUDA enabled and skip the LLaVA components:

CMAKE_ARGS="-DGGML_CUDA=ON -DLLAMA_LLAVA=OFF -DLLAVA_BUILD=OFF" uv pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir
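
If the build succeeds, a quick smoke test can confirm that the CUDA backend is active. This is a minimal sketch, assuming you have some GGUF file on disk; the path below is a placeholder, not part of blendsql:

# Minimal check that the CUDA-enabled wheel imports and can offload layers.
from llama_cpp import Llama

llm = Llama(
    model_path="models/google_gemma-3-12b-it-Q6_K.gguf",  # placeholder path to a local GGUF file
    n_gpu_layers=-1,  # offload every layer to the GPU
    verbose=True,     # llama.cpp logs which backend/device it selected
)
print(llm("Hello", max_tokens=8)["choices"][0]["text"])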

LlamaCpp

Bases: ConstrainedModel

Class for LlamaCpp local Model.

Parameters:

Name                Type            Description                                                      Default
filename            str             Specific .gguf file (local or on HuggingFace) to load            required
model_name_or_path  Optional[str]   Optional path to the model on HuggingFace                        None
caching             bool            Bool determining whether we access the model's cache             True
config              Optional[dict]  Additional parameters to pass to the Llama() construction call   None

Examples:

from blendsql.models import LlamaCpp

model = LlamaCpp(
    filename="google_gemma-3-12b-it-Q6_K.gguf",
    model_name_or_path="bartowski/google_gemma-3-12b-it-GGUF",
    config={"n_gpu_layers": -1, "n_ctx": 8000, "seed": 100, "n_threads": 16},
)
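
A HuggingFace repo is not required: as the source below shows, when model_name_or_path is omitted the filename is passed straight to Llama(), so a local .gguf path works as well. A sketch with an illustrative path:

from blendsql.models import LlamaCpp

# Load from a local .gguf file; no HuggingFace download involved.
model = LlamaCpp(
    filename="/models/google_gemma-3-12b-it-Q6_K.gguf",  # placeholder local path
    config={"n_gpu_layers": -1, "n_ctx": 8000},
)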
Source code in blendsql/models/constrained/guidance.py
class LlamaCpp(ConstrainedModel):
    """Class for LlamaCpp local Model.

    Args:
        filename: Specific .gguf file (local or on HuggingFace) to load
        model_name_or_path: Optional path to the model on HuggingFace
        caching: Bool determining whether we access the model's cache
        config: Additional parameters to pass to the `Llama()` construction call

    Examples:
        ```python
        from blendsql.models import LlamaCpp

        model = LlamaCpp(
            filename="google_gemma-3-12b-it-Q6_K.gguf",
            model_name_or_path="bartowski/google_gemma-3-12b-it-GGUF",
            config={"n_gpu_layers": -1, "n_ctx": 8000, "seed": 100, "n_threads": 16},
        )
        ```
    """

    def __init__(
        self,
        filename: str,
        model_name_or_path: Optional[str] = None,
        config: Optional[dict] = None,
        caching: bool = True,
        **kwargs,
    ):
        if config is None:
            config = {}

        super().__init__(
            model_name_or_path=model_name_or_path,  # type: ignore
            requires_config=False,
            tokenizer=self._load_llama_cpp(
                filename=filename,
                model_name_or_path=model_name_or_path,
                config=config,
                vocab_only=True,
            ).tokenizer_,
            config=config,
            caching=caching,
            **kwargs,
        )
        self.filename = filename

    @staticmethod
    def _load_llama_cpp(
        filename: str,
        model_name_or_path: Optional[str],
        config: dict,
        vocab_only: bool = False,
    ):
        from llama_cpp import Llama

        if model_name_or_path:
            _config = config if not vocab_only else {}
            model = Llama.from_pretrained(
                repo_id=model_name_or_path,
                filename=filename,
                verbose=False,
                vocab_only=vocab_only,
                **_config,
            )
        else:
            model = Llama(filename, verbose=False, vocab_only=vocab_only, **config)

        # https://github.com/abetlen/llama-cpp-python/issues/1610
        import atexit

        @atexit.register
        def free_model():
            model.close()

        return model

    def _load_model(self) -> ModelObj:
        from guidance.models import LlamaCpp as GuidanceLlamaCpp
        import logging

        logging.getLogger("guidance").setLevel(logging.CRITICAL)
        logging.getLogger("llama_cpp").setLevel(logging.CRITICAL)

        # llama.cpp doesn't like when we have two running simultaneously
        #   so we do a little switcheroo with the tokenizer here
        if hasattr(self, "tokenizer"):
            self.__delattr__("tokenizer")

        lm = GuidanceLlamaCpp(
            self._load_llama_cpp(
                filename=self.filename,
                model_name_or_path=self.model_name_or_path,
                config=self.config,
            ),
            echo=False,
            chat_template=self.config.get("chat_template"),
        )
        self.tokenizer = lm.engine.model_obj.tokenizer_
        return lm
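
Note that _load_model() reads a chat_template key from the same config dict and hands it to guidance's LlamaCpp wrapper, while the remaining keys go to the Llama() constructor (the dict is unpacked there as well, so only add keys you expect it to tolerate). A hedged sketch of how that key would be supplied; most GGUF files already embed a template in their metadata, so None is the usual choice:

from blendsql.models import LlamaCpp

model = LlamaCpp(
    filename="google_gemma-3-12b-it-Q6_K.gguf",
    model_name_or_path="bartowski/google_gemma-3-12b-it-GGUF",
    config={
        "n_gpu_layers": -1,
        "n_ctx": 8000,
        # Read via config.get("chat_template") in _load_model(); leave as None
        # to fall back on the template stored in the GGUF metadata.
        "chat_template": None,
    },
)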