
LLMMap


Description

This type of ingredient applies a function to a given column, creating a new column that contains the function's output.

In more formal terms, it is a unary scalar function, much like LENGTH or ABS in standard SQLite.

For example, take the following query.

SELECT merchant FROM transactions
    WHERE {{LLMMap('Is this a pizza shop?', 'transactions::merchant')}} = TRUE

LLMMap is one of our built-in MapIngredients. For each distinct value in the "merchant" column of the "transactions" table, it creates an entry in a new column containing the function's output.

merchant     Is this a pizza shop?
Domino's     1
Safeway      0
Target       0

The temporary table shown above is then combined with the original "transactions" table with an INNER JOIN on the "merchant" column.
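
To make the mechanics concrete, here is a minimal pandas sketch of the same map-then-join behavior. The names below (transactions, mapped) are illustrative only and not part of the BlendSQL API:

import pandas as pd

# Base table, possibly with repeated merchants
transactions = pd.DataFrame(
    {"merchant": ["Domino's", "Safeway", "Target", "Domino's"], "amount": [9, 42, 18, 11]}
)

# Pretend this is the LLM's answer for each *distinct* merchant value
mapped = pd.DataFrame(
    {
        "merchant": ["Domino's", "Safeway", "Target"],
        "Is this a pizza shop?": [1, 0, 0],
    }
)

# The temporary table is joined back onto the base table (INNER JOIN on 'merchant'),
# after which the WHERE clause can filter on the new column
result = transactions.merge(mapped, on="merchant", how="inner")
print(result[result["Is this a pizza shop?"] == 1]["merchant"])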

MapProgram

Bases: Program

Source code in blendsql/ingredients/builtin/map/main.py
class MapProgram(Program):
    def __call__(
        self,
        model: Model,
        question: str,
        values: List[str],
        sep: str,
        include_tf_disclaimer: bool = False,
        max_tokens: Optional[int] = None,
        regex: Optional[Callable[[int], str]] = None,
        output_type: Optional[str] = None,
        example_outputs: Optional[str] = None,
        table_title: Optional[str] = None,
        colname: Optional[str] = None,
        **kwargs,
    ) -> Tuple[str, str]:
        if isinstance(model, LocalModel):
            m: guidance.models.Model = model.model_obj
            with guidance.system():
                m += """Given a set of values from a database, answer the question row-by-row, in order."""
                if include_tf_disclaimer:
                    m += " If the question can be answered with 'true' or 'false', select `t` for 'true' or `f` for 'false'."
                m += newline_dedent(
                    f"""
                If a given value has no appropriate answer, give '-' as a response.
                """
                )
                m += newline_dedent(
                    """
                ---

                The following values come from the column 'Penalties (P+P+S+S)', in a table titled 'Biathlon World Championships 2013 \u2013 Men's pursuit'.
                Q: Total penalty count?
                Here are some example outputs: '1', '2', '5'
                A:
                    - 1 (0+0+0+1) -> 1
                    - 10 (5+3+2+0) -> 10
                    - 6 (2+2+2+0) -> 6

                ---

                The following values come from the column 'Length of use', in a table titled 'Crest Whitestrips'.
                Q: Is the time less than a week?
                A:
                    - 14 days -> f
                    - 10 days -> f
                    - daily -> t
                    - 2 hours -> t

                ---
                """
                )
                if table_title:
                    m += newline_dedent(
                        f"The following values come from the column '{colname}', in a table titled '{table_title}'."
                    )
            with guidance.user():
                m += newline_dedent(f"""Q: {question}\nA:\n""")
            prompt = m._current_prompt()
            if isinstance(model, LocalModel) and regex is not None:
                gen_f = lambda: guidance.regex(pattern=regex)
            else:
                gen_f = lambda: guidance.gen(max_tokens=max_tokens or 1000)

            @guidance(stateless=True, dedent=False)
            def make_predictions(lm, values, gen_f) -> guidance.models.Model:
                for _idx, value in enumerate(values):
                    lm += f"\n{value} -> " + guidance.capture(gen_f(), name=value)
                return lm

            with guidance.assistant():
                m += make_predictions(values=values, gen_f=gen_f)
            return ([m[value] for value in values], prompt)
        else:
            # Use the 'old' style of prompting when we have a remote model
            prompt = ""
            prompt += """Given a set of values from a database, answer the question row-by-row, in order."""
            if include_tf_disclaimer:
                prompt += " If the question can be answered with 'true' or 'false', select `t` for 'true' or `f` for 'false'."
            prompt += newline_dedent(
                f"""
                    The answer should be a list separated by '{sep}', and have {len(values)} items in total.
                    When you have given all {len(values)} answers, stop responding.
                    If a given value has no appropriate answer, give '-' as a response.
                    """
            )
            prompt += newline_dedent(
                """
            ---

            The following values come from the column 'Penalties (P+P+S+S)', in a table titled 'Biathlon World Championships 2013 \u2013 Men's pursuit'.
            Q: Total penalty count?
            Values:
            `1 (0+0+0+1)`
            `10 (5+3+2+0)`
            `6 (2+2+2+0)`

            Output type: numeric
            Here are some example outputs: `9;-`

            A: 1;10;6

            ---

            The following values come from the column 'Length of use', in a table titled 'Crest Whitestrips'.
            Q: Is the time less than a week?
            Values:
            `14 days`
            `10 days`
            `daily`
            `2 hours`

            Output type: boolean
            A: f;f;t;t

            ---
            """
            )
            if table_title:
                prompt += newline_dedent(
                    f"The following values come from the column '{colname}', in a table titled '{table_title}'."
                )
            prompt += newline_dedent(f"""Q: {question}\nValues:\n""")
            for value in values:
                prompt += f"`{value}`\n"
            if output_type:
                prompt += f"\nOutput type: {output_type}"
            if example_outputs:
                prompt += f"\nHere are some example outputs: {example_outputs}\n"
            prompt += "\nA:"
            response = generate(model, prompt=prompt, max_tokens=max_tokens or 1000)
            # Post-process language model response
            _r = [
                i.strip()
                for i in response.strip(CONST.DEFAULT_ANS_SEP).split(
                    CONST.DEFAULT_ANS_SEP
                )
            ]
            return (_r, prompt)
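
Note that MapProgram takes two paths: local models are driven through guidance (optionally with regex-constrained generation), while remote models receive a single flat prompt and return one separator-delimited string. The few-shot examples above suggest ';' as that separator (CONST.DEFAULT_ANS_SEP in the source); a minimal sketch of the remote-path parsing:

# Assuming the remote model answered the 'Crest Whitestrips' example above
response = "f;f;t;t"
sep = ";"  # stand-in for CONST.DEFAULT_ANS_SEP
answers = [a.strip() for a in response.strip(sep).split(sep)]
# answers == ['f', 'f', 't', 't'], one entry per value in the batch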

LLMMap

Bases: MapIngredient

Source code in blendsql/ingredients/builtin/map/main.py
class LLMMap(MapIngredient):
    DESCRIPTION = """
    If question-relevant column(s) contents are not suitable for SQL comparisons or calculations, map it to a new column using the scalar function:
        `{{LLMMap('question', 'table::column')}}`
    """

    def run(
        self,
        model: Model,
        question: str,
        values: List[str],
        value_limit: Union[int, None] = None,
        example_outputs: Optional[str] = None,
        output_type: Optional[str] = None,
        regex: Optional[Callable[[int], str]] = None,
        table_to_title: Optional[Dict[str, str]] = None,
        **kwargs,
    ) -> Iterable[Any]:
        """For each value in a given column, calls a Model and retrieves the output.

        Args:
            question: The question to map onto the values. Will also be the new column name
            model: The Model (blender) we will make calls to.
            values: The list of values to apply question to.
            value_limit: Optional limit on the number of values to pass to the Model
            example_outputs: If binary == False, this gives the Model an example of the output we expect.
            output_type: One of 'numeric', 'string', 'bool'
            regex: Optional regex to constrain answer generation.
            table_to_title: Mapping from tablename to a title providing some more context.

        Returns:
            Iterable[Any] containing the output of the Model for each value.
        """
        if model is None:
            raise IngredientException(
                "LLMMap requires a `Model` object, but nothing was passed!\nMost likely you forgot to set the `default_model` argument in `blend()`"
            )
        # Unpack default kwargs
        tablename, colname = self.unpack_default_kwargs(**kwargs)
        # Remote endpoints can't use patterns
        regex = None if isinstance(model, RemoteModel) else regex
        if value_limit is not None:
            values = values[:value_limit]
        values = [value if not pd.isna(value) else "-" for value in values]
        table_title = None
        if table_to_title is not None:
            if tablename not in table_to_title:
                logger.debug(f"Tablename {tablename} not in given table_to_title!")
            else:
                table_title = table_to_title[tablename]
        split_results: List[Union[str, None]] = []
        # Only use tqdm if we're in debug mode
        context_manager: Iterable = (
            tqdm(
                range(0, len(values), CONST.MAP_BATCH_SIZE),
                total=len(values) // CONST.MAP_BATCH_SIZE,
                desc=f"Making calls to Model with batch_size {CONST.MAP_BATCH_SIZE}",
                bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.CYAN, Fore.RESET),
            )
            if logger.level <= logging.DEBUG
            else range(0, len(values), CONST.MAP_BATCH_SIZE)
        )

        for i in context_manager:
            answer_length = len(values[i : i + CONST.MAP_BATCH_SIZE])
            max_tokens = answer_length * 15
            include_tf_disclaimer = False
            curr_batch_values = values[i : i + CONST.MAP_BATCH_SIZE]

            if output_type == "boolean":
                include_tf_disclaimer = True
            elif isinstance(model, OpenaiLLM):
                include_tf_disclaimer = True

            result: List[str] = model.predict(
                program=MapProgram,
                question=question,
                sep=CONST.DEFAULT_ANS_SEP,
                values=curr_batch_values,
                example_outputs=example_outputs,
                output_type=output_type,
                include_tf_disclaimer=include_tf_disclaimer,
                table_title=table_title,
                regex=regex,
                max_tokens=max_tokens,
                **kwargs,
            )
            # Try to map to booleans and `None`
            _r = [
                {
                    "t": True,
                    "f": False,
                    "true": True,
                    "false": False,
                    "y": True,
                    "n": False,
                    "yes": True,
                    "no": False,
                    CONST.DEFAULT_NAN_ANS: None,
                }.get(i.lower(), i)
                for i in result
            ]
            expected_len = len(curr_batch_values)
            if len(_r) != expected_len:
                logger.debug(
                    Fore.YELLOW
                    + f"Mismatch between length of values and answers!\nvalues:{expected_len}, answers:{len(_r)}"
                    + Fore.RESET
                )
                logger.debug(_r)
            split_results.extend(_r)
        for idx, i in enumerate(split_results):
            if i is None:
                continue
            if isinstance(i, str):
                i = i.replace(",", "")
            try:
                split_results[idx] = literal_eval(i)
                assert isinstance(i, (float, int, str))
            except (ValueError, SyntaxError, AssertionError):
                continue
        logger.debug(
            Fore.YELLOW
            + f"Finished LLMMap with values:\n{json.dumps(dict(zip(values[:10], split_results[:10])), indent=4)}"
            + Fore.RESET
        )
        return split_results
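
The tail of run() above normalizes raw model answers: truthy and falsy tokens become booleans, the NaN answer becomes None, and numeric-looking strings are parsed with literal_eval. A standalone sketch of that post-processing, with an illustrative helper name (normalize_answers is not a BlendSQL function):

from ast import literal_eval
from typing import Any, List

BOOLEAN_MAP = {
    "t": True, "f": False, "true": True, "false": False,
    "y": True, "n": False, "yes": True, "no": False,
}

def normalize_answers(raw: List[str], nan_answer: str = "-") -> List[Any]:
    normalized: List[Any] = []
    for answer in raw:
        if answer == nan_answer:
            normalized.append(None)
        elif answer.lower() in BOOLEAN_MAP:
            normalized.append(BOOLEAN_MAP[answer.lower()])
        else:
            try:
                # '1,000' -> 1000, '3.5' -> 3.5; non-numeric strings pass through unchanged
                normalized.append(literal_eval(answer.replace(",", "")))
            except (ValueError, SyntaxError):
                normalized.append(answer)
    return normalized

# normalize_answers(["t", "f", "-", "1,000", "Domino's"]) -> [True, False, None, 1000, "Domino's"]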

run(model, question, values, value_limit=None, example_outputs=None, output_type=None, regex=None, table_to_title=None, **kwargs)

For each value in a given column, calls a Model and retrieves the output.

Parameters:

question (str, required): The question to map onto the values. Will also be the new column name.

model (Model, required): The Model (blender) we will make calls to.

values (List[str], required): The list of values to apply the question to.

value_limit (Union[int, None], default None): Optional limit on the number of values to pass to the Model.

example_outputs (Optional[str], default None): Gives the Model an example of the output we expect.

output_type (Optional[str], default None): One of 'numeric', 'string', 'boolean'.

regex (Optional[Callable[[int], str]], default None): Optional regex to constrain answer generation (local models only).

table_to_title (Optional[Dict[str, str]], default None): Mapping from tablename to a title providing some more context.

Returns:

Iterable[Any]: The output of the Model for each value.
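
Finally, a minimal end-to-end sketch of running LLMMap through blend(). The database path, model name, and import locations below are assumptions that may need adjusting for your setup and BlendSQL version:

from blendsql import blend, LLMMap
from blendsql.db import SQLite
from blendsql.models import OpenaiLLM

blendsql_query = """
SELECT merchant FROM transactions
    WHERE {{LLMMap('Is this a pizza shop?', 'transactions::merchant')}} = TRUE
"""

smoothie = blend(
    query=blendsql_query,
    db=SQLite("transactions.db"),            # assumed: local SQLite file with a 'transactions' table
    default_model=OpenaiLLM("gpt-4o-mini"),  # assumed: any supported Model works here
    ingredients={LLMMap},
)
print(smoothie.df)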
