Skip to content

Execute a BlendSQL Query

blend()

The blend() function is used to execute a BlendSQL query against a database and return the final result, in addition to the intermediate reasoning steps taken. Execution is done on a database given an ingredient context.

Parameters:

Name Type Description Default
query str

The BlendSQL query to execute

required
db Database

Database connector object

required
ingredients Optional[Collection[Type[Ingredient]]]

Collection of ingredient objects, to use in interpreting BlendSQL query

None
verbose bool

Boolean defining whether to run with logger in debug mode

False
default_model Optional[Model]

Which BlendSQL model to use in performing ingredient tasks in the current query

None
infer_gen_constraints bool

Optionally infer the output format of an IngredientMap call, given the predicate context For example, in {{LLMMap('convert to date', 'w::listing date')}} <= '1960-12-31' We can infer the output format should look like '1960-12-31' and both: 1) Put this string in the example_outputs kwarg 2) If we have a LocalModel, pass the '\d{4}-\d{2}-\d{2}' pattern to guidance

True
table_to_title Optional[Dict[str, str]]

Optional mapping from table name to title of table. Useful for datasets like WikiTableQuestions, where relevant info is stored in table title.

None
schema_qualify bool

Optional bool, determines if we run qualify_columns() from sqlglot This enables us to write BlendSQL scripts over multi-table databases without manually qualifying columns ourselves However, we need to call db.sqlglot_schema if schema_qualify=True, which may add some latency. With single-table queries, we can set this to False.

True

Returns:

Name Type Description
smoothie Smoothie

Smoothie dataclass containing pd.DataFrame output and execution metadata

Examples:

import pandas as pd

from blendsql import blend, LLMMap, LLMQA, LLMJoin
from blendsql.db import Pandas
from blendsql.models import TransformersLLM

# Load model
model = TransformersLLM('Qwen/Qwen1.5-0.5B')

# Prepare our local database
db = Pandas(
    {
        "w": pd.DataFrame(
            (
                ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
                ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
                 '23-10'],
                ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
            ),
            columns=['date', 'rival', 'city', 'venue', 'score']
        ),
        "documents": pd.DataFrame(
            (
                ['bathurst, new south wales', 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
                ['sydney', 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
                ['newcastle, new south wales', 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
            ),
            columns=['title', 'content']
        )
    }
)

# Write BlendSQL query
blendsql = """
SELECT * FROM w
WHERE city = {{
    LLMQA(
        'Which city is located 120 miles west of Sydney?',
        (SELECT * FROM documents WHERE content LIKE '%sydney%'),
        options='w::city'
    )
}}
"""
smoothie = blend(
    query=blendsql,
    db=db,
    ingredients={LLMMap, LLMQA, LLMJoin},
    default_model=model,
    # Optional args below
    infer_gen_constraints=True,
    verbose=True
)
print(smoothie.df)
# ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
# │ date   │ rival             │ city     │ venue           │ score   │
# ├────────┼───────────────────┼──────────┼─────────────────┼─────────┤
# │ 11 jun │ western districts │ bathurst │ bathurst ground │ 11-0    │
# └────────┴───────────────────┴──────────┴─────────────────┴─────────┘
print(smoothie.meta.prompts)
# [
#   {
#       'answer': 'sydney',
#       'question': 'Which city is located 120 miles west of Sydney?',
#       'context': [
#           {'title': 'bathurst, new south wales', 'content': 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about...'},
#           {'title': 'sydney', 'content': 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in...'}
#       ]
#    }
# ]
Source code in blendsql/blend.py
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
def blend(
    query: str,
    db: Database,
    default_model: Optional[Model] = None,
    ingredients: Optional[Collection[Type[Ingredient]]] = None,
    verbose: bool = False,
    infer_gen_constraints: bool = True,
    table_to_title: Optional[Dict[str, str]] = None,
    schema_qualify: bool = True,
) -> Smoothie:
    '''The `blend()` function is used to execute a BlendSQL query against a database and
    return the final result, in addition to the intermediate reasoning steps taken.
    Execution is done on a database given an ingredient context.

    Args:
        query: The BlendSQL query to execute
        db: Database connector object
        ingredients: Collection of ingredient objects, to use in interpreting BlendSQL query
        verbose: Boolean defining whether to run with logger in debug mode
        default_model: Which BlendSQL model to use in performing ingredient tasks in the current query
        infer_gen_constraints: Optionally infer the output format of an `IngredientMap` call, given the predicate context
            For example, in `{{LLMMap('convert to date', 'w::listing date')}} <= '1960-12-31'`
            We can infer the output format should look like '1960-12-31' and both:
                1) Put this string in the `example_outputs` kwarg
                2) If we have a LocalModel, pass the '\d{4}-\d{2}-\d{2}' pattern to guidance
        table_to_title: Optional mapping from table name to title of table.
            Useful for datasets like WikiTableQuestions, where relevant info is stored in table title.
        schema_qualify: Optional bool, determines if we run qualify_columns() from sqlglot
            This enables us to write BlendSQL scripts over multi-table databases without manually qualifying columns ourselves
            However, we need to call `db.sqlglot_schema` if schema_qualify=True, which may add some latency.
            With single-table queries, we can set this to False.

    Returns:
        smoothie: `Smoothie` dataclass containing pd.DataFrame output and execution metadata

    Examples:
        ```python
        import pandas as pd

        from blendsql import blend, LLMMap, LLMQA, LLMJoin
        from blendsql.db import Pandas
        from blendsql.models import TransformersLLM

        # Load model
        model = TransformersLLM('Qwen/Qwen1.5-0.5B')

        # Prepare our local database
        db = Pandas(
            {
                "w": pd.DataFrame(
                    (
                        ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
                        ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
                         '23-10'],
                        ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
                    ),
                    columns=['date', 'rival', 'city', 'venue', 'score']
                ),
                "documents": pd.DataFrame(
                    (
                        ['bathurst, new south wales', 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
                        ['sydney', 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
                        ['newcastle, new south wales', 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
                    ),
                    columns=['title', 'content']
                )
            }
        )

        # Write BlendSQL query
        blendsql = """
        SELECT * FROM w
        WHERE city = {{
            LLMQA(
                'Which city is located 120 miles west of Sydney?',
                (SELECT * FROM documents WHERE content LIKE '%sydney%'),
                options='w::city'
            )
        }}
        """
        smoothie = blend(
            query=blendsql,
            db=db,
            ingredients={LLMMap, LLMQA, LLMJoin},
            default_model=model,
            # Optional args below
            infer_gen_constraints=True,
            verbose=True
        )
        print(smoothie.df)
        # ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
        # │ date   │ rival             │ city     │ venue           │ score   │
        # ├────────┼───────────────────┼──────────┼─────────────────┼─────────┤
        # │ 11 jun │ western districts │ bathurst │ bathurst ground │ 11-0    │
        # └────────┴───────────────────┴──────────┴─────────────────┴─────────┘
        print(smoothie.meta.prompts)
        # [
        #   {
        #       'answer': 'sydney',
        #       'question': 'Which city is located 120 miles west of Sydney?',
        #       'context': [
        #           {'title': 'bathurst, new south wales', 'content': 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about...'},
        #           {'title': 'sydney', 'content': 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in...'}
        #       ]
        #    }
        # ]
        ```
    '''
    if verbose:
        logger.setLevel(logging.DEBUG)
        for handler in logger.handlers:
            handler.setLevel(logging.DEBUG)
    start = time.time()
    try:
        smoothie = _blend(
            query=query,
            db=db,
            default_model=default_model,
            ingredients=ingredients,
            infer_gen_constraints=infer_gen_constraints,
            table_to_title=table_to_title,
            schema_qualify=schema_qualify,
        )
    except Exception as error:
        raise error
    finally:
        # In the case of a recursive `_blend()` call,
        #   this logic allows temp tables to persist until
        #   the final base case is fulfilled.
        db._reset_connection()
    smoothie.meta.process_time_seconds = time.time() - start
    return smoothie

Appendix

preprocess_blendsql()

Parses BlendSQL string with our pyparsing grammar and returns objects required for interpretation and execution.

Parameters:

Name Type Description Default
query str

The BlendSQL query to preprocess

required
default_model Model

Model object, which we attach to each parsed_dict

required

Returns:

Type Description
Tuple[str, dict, set, Kitchen, Set[Ingredient]]

Tuple, containing:

  • ingredient_alias_to_parsed_dict

  • tables_in_ingredients

Examples:

preprocess_blendsql(
    query="SELECT * FROM documents JOIN {{LLMJoin(left_on='w::player', right_on='documents::title')}} WHERE rank = 2",
    default_model=default_model
)
(
    {
        '{{A()}}': {
            'function': 'LLMJoin',
            'args': [],
            'ingredient_aliasname': 'A',
            'raw': "{{ LLMJoin ( left_on= 'w::player' , right_on= 'documents::title' ) }}",
            'kwargs_dict': {
                'left_on': 'w::player',
                'right_on': 'documents::title'
            }
        }
    },
    {'documents', 'w'}
)

Source code in blendsql/blend.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
def preprocess_blendsql(
    query: str,
    kitchen: Kitchen,
    ingredients: Collection[Ingredient],
    default_model: Model,
) -> Tuple[str, dict, set, Kitchen, Set[Ingredient]]:
    """Parses BlendSQL string with our pyparsing grammar and returns objects
    required for interpretation and execution.

    Args:
        query: The BlendSQL query to preprocess
        default_model: Model object, which we attach to each parsed_dict

    Returns:
        Tuple, containing:

            - ingredient_alias_to_parsed_dict

            - tables_in_ingredients

    Examples:
        ```python
        preprocess_blendsql(
            query="SELECT * FROM documents JOIN {{LLMJoin(left_on='w::player', right_on='documents::title')}} WHERE rank = 2",
            default_model=default_model
        )
        ```
        ```text
        (
            {
                '{{A()}}': {
                    'function': 'LLMJoin',
                    'args': [],
                    'ingredient_aliasname': 'A',
                    'raw': "{{ LLMJoin ( left_on= 'w::player' , right_on= 'documents::title' ) }}",
                    'kwargs_dict': {
                        'left_on': 'w::player',
                        'right_on': 'documents::title'
                    }
                }
            },
            {'documents', 'w'}
        )
        ```
    """
    ingredient_alias_to_parsed_dict: Dict[str, dict] = {}
    ingredient_str_to_alias: Dict[str, str] = {}
    tables_in_ingredients: Set[str] = set()
    query = re.sub(r"(\s+)", " ", query)
    reversed_scan_res = [scan_res for scan_res in grammar.scanString(query)][::-1]
    for idx, (parse_results, start, end) in enumerate(reversed_scan_res):
        original_ingredient_string = query[start:end]
        # If we're in between parentheses, add a `SELECT`
        # This way it gets picked up as a subquery to parse later
        # TODO: is this safe to do?
        if query[start - 2] == "(" and query[end + 1] == ")":
            inserted_select = " SELECT "
            query = query[:start] + inserted_select + query[start:end] + query[end:]
            start += len(inserted_select)
            end += len(inserted_select)
        if (
            original_ingredient_string in ingredient_str_to_alias
        ):  # If we've already processed this function, no need to do it again
            substituted_ingredient_alias = ingredient_str_to_alias[
                original_ingredient_string
            ]
        else:
            parsed_results_dict = parse_results.as_dict()
            _ingredient = kitchen.get_from_name(parsed_results_dict["function"])
            if _ingredient.ingredient_type == IngredientType.ALIAS:
                _original_ingredient_string, ingredient_dependencies = _ingredient(
                    *parsed_results_dict["args"],
                    **{x[0]: x[-1] for x in parsed_results_dict["kwargs"]},
                )
                kitchen.extend(ingredient_dependencies, flag_duplicates=False)
                ingredients = set(ingredients)
                existing_ingredients = set([i.__name__ for i in ingredients])
                for ingredient in ingredient_dependencies:
                    if ingredient.__name__ not in existing_ingredients:
                        ingredients.add(ingredient)
                parse_results = grammar.parseString(_original_ingredient_string)
                parsed_results_dict = parse_results.as_dict()
                logger.debug(
                    Fore.CYAN
                    + "Unpacked alias `"
                    + Fore.LIGHTCYAN_EX
                    + original_ingredient_string
                    + Fore.CYAN
                    + "` to `"
                    + Fore.LIGHTCYAN_EX
                    + _original_ingredient_string
                    + "`"
                    + Fore.RESET
                )
                original_ingredient_string = _original_ingredient_string
            ingredient_aliasname = string.ascii_uppercase[idx]
            parsed_results_dict["ingredient_aliasname"] = ingredient_aliasname
            substituted_ingredient_alias = "{{" + f"{ingredient_aliasname}()" + "}}"
            ingredient_str_to_alias[
                original_ingredient_string
            ] = substituted_ingredient_alias
            # Remove parentheses at beginning and end of arg/kwarg
            # TODO: this should be handled by pyparsing
            for arg_type in {"args", "kwargs"}:
                for idx in range(len(parsed_results_dict[arg_type])):
                    curr_arg = parsed_results_dict[arg_type][idx]
                    curr_arg = curr_arg[-1] if arg_type == "kwargs" else curr_arg
                    if not isinstance(curr_arg, str):
                        continue
                    formatted_curr_arg = re.sub(
                        r"(^\()(.*)(\)$)", r"\2", curr_arg
                    ).strip()
                    if arg_type == "args":
                        parsed_results_dict[arg_type][idx] = formatted_curr_arg
                    else:
                        parsed_results_dict[arg_type][idx][
                            -1
                        ] = formatted_curr_arg  # kwargs gets returned as ['limit', '=', 10] sort of list
            # So we need to parse by indices in dict expression
            # maybe if I was better at pp.Suppress we wouldn't need this
            kwargs_dict = {x[0]: x[-1] for x in parsed_results_dict["kwargs"]}
            kwargs_dict[IngredientKwarg.MODEL] = default_model
            # Heuristic check to see if we should snag the singleton arg as context
            if (
                len(parsed_results_dict["args"]) == 1
                and "::" in parsed_results_dict["args"][0]
            ):
                kwargs_dict[IngredientKwarg.CONTEXT] = parsed_results_dict["args"].pop()
            context_arg = kwargs_dict.get(
                IngredientKwarg.CONTEXT,
                (
                    parsed_results_dict["args"][1]
                    if len(parsed_results_dict["args"]) > 1
                    else (
                        parsed_results_dict["args"][1]
                        if len(parsed_results_dict["args"]) > 1
                        else None
                    )
                ),
            )
            for arg in {
                context_arg,
                kwargs_dict.get("left_on", None),
                kwargs_dict.get("right_on", None),
            }:
                if arg is None:
                    continue
                if not check.is_blendsql_query(arg):
                    tablename, _ = get_tablename_colname(arg)
                    tables_in_ingredients.add(tablename)
            # We don't need raw kwargs anymore
            # in the future, we just refer to kwargs_dict
            parsed_results_dict.pop("kwargs")
            # Below we track the 'raw' representation, in case we need to pass into
            #   a recursive BlendSQL call later
            ingredient_alias_to_parsed_dict[
                substituted_ingredient_alias
            ] = parsed_results_dict | {
                "raw": query[start:end],
                "kwargs_dict": kwargs_dict,
            }
        query = query[:start] + substituted_ingredient_alias + query[end:]
    return (
        query.strip(),
        ingredient_alias_to_parsed_dict,
        tables_in_ingredients,
        kitchen,
        ingredients,
    )