LiteLLM

Environment

In order to use this model, you will need a `.env` file containing all required API keys.
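
LiteLLM reads each provider's standard key name from the environment. As a hedged sketch, a minimal `.env` for the providers mentioned below might look like this (values are placeholders):

```
OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-...
GEMINI_API_KEY=...
```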

Bases: UnconstrainedModel

Class for LiteLLM remote model integration. https://github.com/BerriAI/litellm

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_name_or_path` | `str` | Name or identifier of the model to use with LiteLLM. Should begin with the provider, e.g. `openai/gpt-3.5-turbo`, `gemini/gemini-2.0-flash-exp`, `anthropic/claude-3-7-sonnet-20250219`. | *required* |
| `env` | `str` | Environment path; defaults to the current directory. | `'.'` |
| `config` | `Optional[dict]` | Optional dictionary of model configuration parameters. | `None` |
| `caching` | `bool` | Whether to enable response caching. | `True` |
| `**kwargs` |  | Additional keyword arguments to pass to the model. | `{}` |

Examples:

```python
from blendsql.models import LiteLLM
model = LiteLLM("openai/gpt-4o-mini", config={"temperature": 0.7})
```
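
The remaining constructor arguments are passed the same way. A hedged sketch with illustrative values only (per the source below, `config` keys are forwarded to LiteLLM's completion call):

```python
from blendsql.models import LiteLLM

model = LiteLLM(
    "anthropic/claude-3-7-sonnet-20250219",
    env=".",                      # directory holding the .env file with API keys
    config={"temperature": 0.0},  # forwarded to LiteLLM's completion call
    caching=False,                # disable response caching for this model
)
```
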
Source code in blendsql/models/unconstrained/_litellm.py
````python
class LiteLLM(UnconstrainedModel):
    """Class for LiteLLM remote model integration.
    https://github.com/BerriAI/litellm

    Args:
        model_name_or_path: Name or identifier of the model to use with LiteLLM.
            Should begin with provider, e.g. `openai/gpt-3.5-turbo`, `gemini/gemini-2.0-flash-exp`, `anthropic/claude-3-7-sonnet-20250219`.
        env: Environment path, defaults to current directory (".")
        config: Optional dictionary containing model configuration parameters
        caching: Bool determining whether to enable response caching
        **kwargs: Additional keyword arguments to pass to the model

    Examples:
        ```python
        from blendsql.models import LiteLLM
        model = LiteLLM("openai/gpt-4o-mini", config={"temperature": 0.7})
        ```
    """

    def __init__(
        self,
        model_name_or_path: str,
        env: str = ".",
        config: Optional[dict] = None,
        caching: bool = True,
        **kwargs,
    ):
        if config is None:
            config = {}
        super().__init__(
            model_name_or_path=model_name_or_path,
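            # Ollama models are served locally, so no provider API key (.env config) is required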
            requires_config=False if model_name_or_path.startswith("ollama") else True,
            config=DEFAULT_CONFIG | config,
            env=env,
            caching=caching,
            **kwargs,
        )

    async def _generate(
        self,
        messages_list: List[List[dict]],
        max_tokens: Optional[int] = None,
        stop_at: Optional[List[str]] = None,
        **kwargs,
    ):
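        # Async request limit, read from the environment variable named by ASYNC_LIMIT_KEY
        # and falling back to DEFAULT_ASYNC_LIMIT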
        sem = Semaphore(int(os.getenv(ASYNC_LIMIT_KEY, DEFAULT_ASYNC_LIMIT)))
        async with sem:
            responses = [
                acompletion(
                    model=self.model_name_or_path,
                    messages=messages,
                    max_tokens=max_tokens,
                    stop=stop_at,
                    **self.config,
                )
                for messages in messages_list
            ]
            return [m for m in await asyncio.gather(*responses)]

    def generate(self, *args, **kwargs) -> List[str]:
        """Handles cache lookup and generation using LiteLLM."""
        responses, key = None, None
        if self.caching:
            responses, key = self.check_cache(*args, **kwargs)
        if responses is None:
            responses = asyncio.get_event_loop().run_until_complete(
                self._generate(*args, **kwargs)
            )  # type: ignore
            self.num_generation_calls += 1
        self.prompt_tokens += sum([r.usage.prompt_tokens for r in responses])
        self.completion_tokens += sum([r.usage.completion_tokens for r in responses])
        if self.caching:
            self.cache[key] = responses
        return [r.choices[0].message.content for r in responses]
````
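
A minimal sketch of calling `generate()` directly, based only on the signatures above. It assumes the relevant API key is available via your `.env`; in typical BlendSQL usage the model object is handed to a query rather than called by hand.

```python
from blendsql.models import LiteLLM

model = LiteLLM("openai/gpt-4o-mini", caching=False)

# One list of chat-style messages per prompt; one completion string is returned per entry.
outputs = model.generate(
    messages_list=[[{"role": "user", "content": "Name one European capital."}]],
    max_tokens=16,
)
print(outputs[0])
```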