Skip to content

Execute a BlendSQL Query

blend()

The blend() function is used to execute a BlendSQL query against a database and return the final result, in addition to the intermediate reasoning steps taken. Execution is done on a database given an ingredient context.

Parameters:

Name Type Description Default
query str

The BlendSQL query to execute

required
db Database

Database connector object

required
ingredients Optional[Collection[Type[Ingredient]]]

Collection of ingredient objects, to use in interpreting BlendSQL query

None
verbose bool

Boolean defining whether to run with logger in debug mode

False
default_model Optional[Model]

Which BlendSQL model to use in performing ingredient tasks in the current query

None
infer_gen_constraints bool

Optionally infer the output format of an IngredientMap call, given the predicate context For example, in {{LLMMap('convert to date', 'w::listing date')}} <= '1960-12-31' We can infer the output format should look like '1960-12-31' and both: 1) Put this string in the example_outputs kwarg 2) If we have a LocalModel, pass the '\d{4}-\d{2}-\d{2}' pattern to guidance

True
table_to_title Optional[Dict[str, str]]

Optional mapping from table name to title of table. Useful for datasets like WikiTableQuestions, where relevant info is stored in table title.

None
schema_qualify bool

Optional bool, determines if we run qualify_columns() from sqlglot This enables us to write BlendSQL scripts over multi-table databases without manually qualifying columns ourselves However, we need to call db.sqlglot_schema if schema_qualify=True, which may add some latency. With single-table queries, we can set this to False.

True

Returns:

Name Type Description
smoothie Smoothie

Smoothie dataclass containing pd.DataFrame output and execution metadata

Examples:

import pandas as pd

from blendsql import blend, LLMMap, LLMQA, LLMJoin
from blendsql.db import Pandas
from blendsql.models import TransformersLLM

# Load model
model = TransformersLLM('Qwen/Qwen1.5-0.5B')

# Prepare our local database
db = Pandas(
    {
        "w": pd.DataFrame(
            (
                ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
                ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
                 '23-10'],
                ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
            ),
            columns=['date', 'rival', 'city', 'venue', 'score']
        ),
        "documents": pd.DataFrame(
            (
                ['bathurst, new south wales', 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
                ['sydney', 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
                ['newcastle, new south wales', 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
            ),
            columns=['title', 'content']
        )
    }
)

# Write BlendSQL query
blendsql = """
SELECT * FROM w
WHERE city = {{
    LLMQA(
        'Which city is located 120 miles west of Sydney?',
        (SELECT * FROM documents WHERE content LIKE '%sydney%'),
        options='w::city'
    )
}}
"""
smoothie = blend(
    query=blendsql,
    db=db,
    ingredients={LLMMap, LLMQA, LLMJoin},
    default_model=model,
    # Optional args below
    infer_gen_constraints=True,
    verbose=True
)
print(smoothie.df)
# ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
# │ date   │ rival             │ city     │ venue           │ score   │
# ├────────┼───────────────────┼──────────┼─────────────────┼─────────┤
# │ 11 jun │ western districts │ bathurst │ bathurst ground │ 11-0    │
# └────────┴───────────────────┴──────────┴─────────────────┴─────────┘
print(smoothie.meta.prompts)
# [
#   {
#       'answer': 'sydney',
#       'question': 'Which city is located 120 miles west of Sydney?',
#       'context': [
#           {'title': 'bathurst, new south wales', 'content': 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about...'},
#           {'title': 'sydney', 'content': 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in...'}
#       ]
#    }
# ]
Source code in blendsql/blend.py
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
def blend(
    query: str,
    db: Database,
    default_model: Optional[Model] = None,
    ingredients: Optional[Collection[Type[Ingredient]]] = None,
    verbose: bool = False,
    infer_gen_constraints: bool = True,
    table_to_title: Optional[Dict[str, str]] = None,
    schema_qualify: bool = True,
) -> Smoothie:
    '''The `blend()` function is used to execute a BlendSQL query against a database and
    return the final result, in addition to the intermediate reasoning steps taken.
    Execution is done on a database given an ingredient context.

    Args:
        query: The BlendSQL query to execute
        db: Database connector object
        ingredients: Collection of ingredient objects, to use in interpreting BlendSQL query
        verbose: Boolean defining whether to run with logger in debug mode
        default_model: Which BlendSQL model to use in performing ingredient tasks in the current query
        infer_gen_constraints: Optionally infer the output format of an `IngredientMap` call, given the predicate context
            For example, in `{{LLMMap('convert to date', 'w::listing date')}} <= '1960-12-31'`
            We can infer the output format should look like '1960-12-31' and both:
                1) Put this string in the `example_outputs` kwarg
                2) If we have a LocalModel, pass the '\d{4}-\d{2}-\d{2}' pattern to guidance
        table_to_title: Optional mapping from table name to title of table.
            Useful for datasets like WikiTableQuestions, where relevant info is stored in table title.
        schema_qualify: Optional bool, determines if we run qualify_columns() from sqlglot
            This enables us to write BlendSQL scripts over multi-table databases without manually qualifying columns ourselves
            However, we need to call `db.sqlglot_schema` if schema_qualify=True, which may add some latency.
            With single-table queries, we can set this to False.

    Returns:
        smoothie: `Smoothie` dataclass containing pd.DataFrame output and execution metadata

    Examples:
        ```python
        import pandas as pd

        from blendsql import blend, LLMMap, LLMQA, LLMJoin
        from blendsql.db import Pandas
        from blendsql.models import TransformersLLM

        # Load model
        model = TransformersLLM('Qwen/Qwen1.5-0.5B')

        # Prepare our local database
        db = Pandas(
            {
                "w": pd.DataFrame(
                    (
                        ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
                        ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
                         '23-10'],
                        ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
                    ),
                    columns=['date', 'rival', 'city', 'venue', 'score']
                ),
                "documents": pd.DataFrame(
                    (
                        ['bathurst, new south wales', 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
                        ['sydney', 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
                        ['newcastle, new south wales', 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
                    ),
                    columns=['title', 'content']
                )
            }
        )

        # Write BlendSQL query
        blendsql = """
        SELECT * FROM w
        WHERE city = {{
            LLMQA(
                'Which city is located 120 miles west of Sydney?',
                (SELECT * FROM documents WHERE content LIKE '%sydney%'),
                options='w::city'
            )
        }}
        """
        smoothie = blend(
            query=blendsql,
            db=db,
            ingredients={LLMMap, LLMQA, LLMJoin},
            default_model=model,
            # Optional args below
            infer_gen_constraints=True,
            verbose=True
        )
        print(smoothie.df)
        # ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
        # │ date   │ rival             │ city     │ venue           │ score   │
        # ├────────┼───────────────────┼──────────┼─────────────────┼─────────┤
        # │ 11 jun │ western districts │ bathurst │ bathurst ground │ 11-0    │
        # └────────┴───────────────────┴──────────┴─────────────────┴─────────┘
        print(smoothie.meta.prompts)
        # [
        #   {
        #       'answer': 'sydney',
        #       'question': 'Which city is located 120 miles west of Sydney?',
        #       'context': [
        #           {'title': 'bathurst, new south wales', 'content': 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about...'},
        #           {'title': 'sydney', 'content': 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in...'}
        #       ]
        #    }
        # ]
        ```
    '''
    if verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)
    start = time.time()
    try:
        smoothie = _blend(
            query=query,
            db=db,
            default_model=default_model,
            ingredients=ingredients,
            infer_gen_constraints=infer_gen_constraints,
            table_to_title=table_to_title,
            schema_qualify=schema_qualify,
        )
    except Exception as error:
        raise error
    finally:
        # In the case of a recursive `_blend()` call,
        #   this logic allows temp tables to persist until
        #   the final base case is fulfilled.
        db._reset_connection()
    smoothie.meta.process_time_seconds = time.time() - start
    return smoothie

Appendix

preprocess_blendsql()

Parses BlendSQL string with our pyparsing grammar and returns objects required for interpretation and execution.

Parameters:

Name Type Description Default
query str

The BlendSQL query to preprocess

required
default_model Model

Model object, which we attach to each parsed_dict

required

Returns:

Type Description
Tuple[str, dict, set]

Tuple, containing:

  • ingredient_alias_to_parsed_dict

  • tables_in_ingredients

Examples:

preprocess_blendsql(
    query="SELECT * FROM documents JOIN {{LLMJoin(left_on='w::player', right_on='documents::title')}} WHERE rank = 2",
    default_model=default_model
)
(
    {
        '{{A()}}': {
            'function': 'LLMJoin',
            'args': [],
            'ingredient_aliasname': 'A',
            'raw': "{{ LLMJoin ( left_on= 'w::player' , right_on= 'documents::title' ) }}",
            'kwargs_dict': {
                'left_on': 'w::player',
                'right_on': 'documents::title'
            }
        }
    },
    {'documents', 'w'}
)

Source code in blendsql/blend.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def preprocess_blendsql(query: str, default_model: Model) -> Tuple[str, dict, set]:
    """Parses BlendSQL string with our pyparsing grammar and returns objects
    required for interpretation and execution.

    Args:
        query: The BlendSQL query to preprocess
        default_model: Model object, which we attach to each parsed_dict

    Returns:
        Tuple, containing:

            - ingredient_alias_to_parsed_dict

            - tables_in_ingredients

    Examples:
        ```python
        preprocess_blendsql(
            query="SELECT * FROM documents JOIN {{LLMJoin(left_on='w::player', right_on='documents::title')}} WHERE rank = 2",
            default_model=default_model
        )
        ```
        ```text
        (
            {
                '{{A()}}': {
                    'function': 'LLMJoin',
                    'args': [],
                    'ingredient_aliasname': 'A',
                    'raw': "{{ LLMJoin ( left_on= 'w::player' , right_on= 'documents::title' ) }}",
                    'kwargs_dict': {
                        'left_on': 'w::player',
                        'right_on': 'documents::title'
                    }
                }
            },
            {'documents', 'w'}
        )
        ```
    """
    ingredient_alias_to_parsed_dict: Dict[str, dict] = {}
    ingredient_str_to_alias: Dict[str, str] = {}
    tables_in_ingredients: Set[str] = set()
    query = re.sub(r"(\s+)", " ", query)
    reversed_scan_res = [scan_res for scan_res in grammar.scanString(query)][::-1]
    for idx, (parse_results, start, end) in enumerate(reversed_scan_res):
        original_ingredient_string = query[start:end]
        # If we're in between parentheses, add a `SELECT`
        # This way it gets picked up as a subquery to parse later
        # TODO: is this safe to do?
        if query[start - 2] == "(" and query[end + 1] == ")":
            inserted_select = " SELECT "
            query = query[:start] + inserted_select + query[start:end] + query[end:]
            start += len(inserted_select)
            end += len(inserted_select)
        if (
            original_ingredient_string in ingredient_str_to_alias
        ):  # If we've already processed this function, no need to do it again
            substituted_ingredient_alias = ingredient_str_to_alias[
                original_ingredient_string
            ]
        else:
            parsed_results_dict = parse_results.as_dict()
            ingredient_aliasname = string.ascii_uppercase[idx]
            parsed_results_dict["ingredient_aliasname"] = ingredient_aliasname
            substituted_ingredient_alias = "{{" + f"{ingredient_aliasname}()" + "}}"
            ingredient_str_to_alias[
                original_ingredient_string
            ] = substituted_ingredient_alias
            # Remove parentheses at beginning and end of arg/kwarg
            # TODO: this should be handled by pyparsing
            for arg_type in {"args", "kwargs"}:
                for idx in range(len(parsed_results_dict[arg_type])):
                    curr_arg = parsed_results_dict[arg_type][idx]
                    curr_arg = curr_arg[-1] if arg_type == "kwargs" else curr_arg
                    if not isinstance(curr_arg, str):
                        continue
                    formatted_curr_arg = re.sub(
                        r"(^\()(.*)(\)$)", r"\2", curr_arg
                    ).strip()
                    if arg_type == "args":
                        parsed_results_dict[arg_type][idx] = formatted_curr_arg
                    else:
                        parsed_results_dict[arg_type][idx][
                            -1
                        ] = formatted_curr_arg  # kwargs gets returned as ['limit', '=', 10] sort of list
            # So we need to parse by indices in dict expression
            # maybe if I was better at pp.Suppress we wouldn't need this
            kwargs_dict = {x[0]: x[-1] for x in parsed_results_dict["kwargs"]}
            kwargs_dict[IngredientKwarg.MODEL] = default_model
            # Heuristic check to see if we should snag the singleton arg as context
            if (
                len(parsed_results_dict["args"]) == 1
                and "::" in parsed_results_dict["args"][0]
            ):
                kwargs_dict[IngredientKwarg.CONTEXT] = parsed_results_dict["args"].pop()
            context_arg = kwargs_dict.get(
                IngredientKwarg.CONTEXT,
                (
                    parsed_results_dict["args"][1]
                    if len(parsed_results_dict["args"]) > 1
                    else (
                        parsed_results_dict["args"][1]
                        if len(parsed_results_dict["args"]) > 1
                        else None
                    )
                ),
            )
            for arg in {
                context_arg,
                kwargs_dict.get("left_on", None),
                kwargs_dict.get("right_on", None),
            }:
                if arg is None:
                    continue
                if not arg.upper().startswith(("SELECT", "WITH")):
                    tablename, _ = get_tablename_colname(arg)
                    tables_in_ingredients.add(tablename)
            # We don't need raw kwargs anymore
            # in the future, we just refer to kwargs_dict
            parsed_results_dict.pop("kwargs")
            # Below we track the 'raw' representation, in case we need to pass into
            #   a recursive BlendSQL call later
            ingredient_alias_to_parsed_dict[
                substituted_ingredient_alias
            ] = parsed_results_dict | {
                "raw": query[start:end],
                "kwargs_dict": kwargs_dict,
            }
        query = query[:start] + substituted_ingredient_alias + query[end:]
    return (query.strip(), ingredient_alias_to_parsed_dict, tables_in_ingredients)