Execute a BlendSQL Query

blend()

The blend() function is used to execute a BlendSQL query against a database and return the final result, in addition to the intermediate reasoning steps taken. Execution is done on a database given an ingredient context.

Parameters:

Name	Type	Description	Default
`query`	`str`	The BlendSQL query to execute	required
`db`	`Database`	Database connector object	required
`ingredients`	`Optional[Collection[Type[Ingredient]]]`	Collection of ingredient objects, to use in interpreting BlendSQL query	`None`
`verbose`	`bool`	Boolean defining whether to run with logger in debug mode	`False`
`default_model`	`Optional[Model]`	Which BlendSQL model to use in performing ingredient tasks in the current query	`None`
`infer_gen_constraints`	`bool`	Optionally infer the output format of an `IngredientMap` call, given the predicate context For example, in `{{LLMMap('convert to date', 'w::listing date')}} <= '1960-12-31'` We can infer the output format should look like '1960-12-31' and both: 1) Put this string in the `example_outputs` kwarg 2) If we have a LocalModel, pass the '\d{4}-\d{2}-\d{2}' pattern to guidance	`True`
`table_to_title`	`Optional[Dict[str, str]]`	Optional mapping from table name to title of table. Useful for datasets like WikiTableQuestions, where relevant info is stored in table title.	`None`
`schema_qualify`	`bool`	Optional bool, determines if we run qualify_columns() from sqlglot This enables us to write BlendSQL scripts over multi-table databases without manually qualifying columns ourselves However, we need to call `db.sqlglot_schema` if schema_qualify=True, which may add some latency. With single-table queries, we can set this to False.	`True`

Returns:

Name	Type	Description
`smoothie`	`Smoothie`	`Smoothie` dataclass containing pd.DataFrame output and execution metadata

Examples:

import pandas as pd

from blendsql import blend, LLMMap, LLMQA, LLMJoin
from blendsql.db import Pandas
from blendsql.models import TransformersLLM

# Load model
model = TransformersLLM('Qwen/Qwen1.5-0.5B')

# Prepare our local database
db = Pandas(
    {
        "w": pd.DataFrame(
            (
                ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
                ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
                 '23-10'],
                ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
            ),
            columns=['date', 'rival', 'city', 'venue', 'score']
        ),
        "documents": pd.DataFrame(
            (
                ['bathurst, new south wales', 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
                ['sydney', 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
                ['newcastle, new south wales', 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
            ),
            columns=['title', 'content']
        )
    }
)

# Write BlendSQL query
blendsql = """
SELECT * FROM w
WHERE city = {{
    LLMQA(
        'Which city is located 120 miles west of Sydney?',
        (SELECT * FROM documents WHERE content LIKE '%sydney%'),
        options='w::city'
    )
}}
"""
smoothie = blend(
    query=blendsql,
    db=db,
    ingredients={LLMMap, LLMQA, LLMJoin},
    default_model=model,
    # Optional args below
    infer_gen_constraints=True,
    verbose=True
)
print(smoothie.df)
# ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
# │ date   │ rival             │ city     │ venue           │ score   │
# ├────────┼───────────────────┼──────────┼─────────────────┼─────────┤
# │ 11 jun │ western districts │ bathurst │ bathurst ground │ 11-0    │
# └────────┴───────────────────┴──────────┴─────────────────┴─────────┘
print(smoothie.meta.prompts)
# [
#   {
#       'answer': 'sydney',
#       'question': 'Which city is located 120 miles west of Sydney?',
#       'context': [
#           {'title': 'bathurst, new south wales', 'content': 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about...'},
#           {'title': 'sydney', 'content': 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in...'}
#       ]
#    }
# ]

Source code in blendsql/blend.py

def blend(
    query: str,
    db: Database,
    default_model: Optional[Model] = None,
    ingredients: Optional[Collection[Type[Ingredient]]] = None,
    verbose: bool = False,
    infer_gen_constraints: bool = True,
    table_to_title: Optional[Dict[str, str]] = None,
    schema_qualify: bool = True,
) -> Smoothie:
    '''The `blend()` function is used to execute a BlendSQL query against a database and
    return the final result, in addition to the intermediate reasoning steps taken.
    Execution is done on a database given an ingredient context.

    Args:
        query: The BlendSQL query to execute
        db: Database connector object
        ingredients: Collection of ingredient objects, to use in interpreting BlendSQL query
        verbose: Boolean defining whether to run with logger in debug mode
        default_model: Which BlendSQL model to use in performing ingredient tasks in the current query
        infer_gen_constraints: Optionally infer the output format of an `IngredientMap` call, given the predicate context
            For example, in `{{LLMMap('convert to date', 'w::listing date')}} <= '1960-12-31'`
            We can infer the output format should look like '1960-12-31' and both:
                1) Put this string in the `example_outputs` kwarg
                2) If we have a LocalModel, pass the '\d{4}-\d{2}-\d{2}' pattern to guidance
        table_to_title: Optional mapping from table name to title of table.
            Useful for datasets like WikiTableQuestions, where relevant info is stored in table title.
        schema_qualify: Optional bool, determines if we run qualify_columns() from sqlglot
            This enables us to write BlendSQL scripts over multi-table databases without manually qualifying columns ourselves
            However, we need to call `db.sqlglot_schema` if schema_qualify=True, which may add some latency.
            With single-table queries, we can set this to False.

    Returns:
        smoothie: `Smoothie` dataclass containing pd.DataFrame output and execution metadata

    Examples:
        ```python
        import pandas as pd

        from blendsql import blend, LLMMap, LLMQA, LLMJoin
        from blendsql.db import Pandas
        from blendsql.models import TransformersLLM

        # Load model
        model = TransformersLLM('Qwen/Qwen1.5-0.5B')

        # Prepare our local database
        db = Pandas(
            {
                "w": pd.DataFrame(
                    (
                        ['11 jun', 'western districts', 'bathurst', 'bathurst ground', '11-0'],
                        ['12 jun', 'wallaroo & university nsq', 'sydney', 'cricket ground',
                         '23-10'],
                        ['5 jun', 'northern districts', 'newcastle', 'sports ground', '29-0']
                    ),
                    columns=['date', 'rival', 'city', 'venue', 'score']
                ),
                "documents": pd.DataFrame(
                    (
                        ['bathurst, new south wales', 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about 200 kilometres ( 120 mi ) west-northwest of sydney and is the seat of the bathurst regional council .'],
                        ['sydney', 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in australia and oceania . located on australia s east coast , the metropolis surrounds port jackson.'],
                        ['newcastle, new south wales', 'the newcastle ( /ˈnuːkɑːsəl/ new-kah-səl ) metropolitan area is the second most populated area in the australian state of new south wales and includes the newcastle and lake macquarie local government areas .']
                    ),
                    columns=['title', 'content']
                )
            }
        )

        # Write BlendSQL query
        blendsql = """
        SELECT * FROM w
        WHERE city = {{
            LLMQA(
                'Which city is located 120 miles west of Sydney?',
                (SELECT * FROM documents WHERE content LIKE '%sydney%'),
                options='w::city'
            )
        }}
        """
        smoothie = blend(
            query=blendsql,
            db=db,
            ingredients={LLMMap, LLMQA, LLMJoin},
            default_model=model,
            # Optional args below
            infer_gen_constraints=True,
            verbose=True
        )
        print(smoothie.df)
        # ┌────────┬───────────────────┬──────────┬─────────────────┬─────────┐
        # │ date   │ rival             │ city     │ venue           │ score   │
        # ├────────┼───────────────────┼──────────┼─────────────────┼─────────┤
        # │ 11 jun │ western districts │ bathurst │ bathurst ground │ 11-0    │
        # └────────┴───────────────────┴──────────┴─────────────────┴─────────┘
        print(smoothie.meta.prompts)
        # [
        #   {
        #       'answer': 'sydney',
        #       'question': 'Which city is located 120 miles west of Sydney?',
        #       'context': [
        #           {'title': 'bathurst, new south wales', 'content': 'bathurst /ˈbæθərst/ is a city in the central tablelands of new south wales , australia . it is about...'},
        #           {'title': 'sydney', 'content': 'sydney ( /ˈsɪdni/ ( listen ) sid-nee ) is the state capital of new south wales and the most populous city in...'}
        #       ]
        #    }
        # ]
        ```
    '''
    if verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)
    start = time.time()
    try:
        smoothie = _blend(
            query=query,
            db=db,
            default_model=default_model,
            ingredients=ingredients,
            infer_gen_constraints=infer_gen_constraints,
            table_to_title=table_to_title,
            schema_qualify=schema_qualify,
        )
    except Exception as error:
        raise error
    finally:
        # In the case of a recursive `_blend()` call,
        #   this logic allows temp tables to persist until
        #   the final base case is fulfilled.
        db._reset_connection()
    smoothie.meta.process_time_seconds = time.time() - start
    return smoothie

Appendix

preprocess_blendsql()

Parses BlendSQL string with our pyparsing grammar and returns objects required for interpretation and execution.

Parameters:

Name	Type	Description	Default
`query`	`str`	The BlendSQL query to preprocess	required
`default_model`	`Model`	Model object, which we attach to each parsed_dict	required

Returns:

Type	Description
`Tuple[str, dict, set]`	Tuple, containing: ingredient_alias_to_parsed_dict tables_in_ingredients

Examples:

preprocess_blendsql(
    query="SELECT * FROM documents JOIN {{LLMJoin(left_on='w::player', right_on='documents::title')}} WHERE rank = 2",
    default_model=default_model
)

(
    {
        '{{A()}}': {
            'function': 'LLMJoin',
            'args': [],
            'ingredient_aliasname': 'A',
            'raw': "{{ LLMJoin ( left_on= 'w::player' , right_on= 'documents::title' ) }}",
            'kwargs_dict': {
                'left_on': 'w::player',
                'right_on': 'documents::title'
            }
        }
    },
    {'documents', 'w'}
)

Source code in blendsql/blend.py

def preprocess_blendsql(query: str, default_model: Model) -> Tuple[str, dict, set]:
    """Parses BlendSQL string with our pyparsing grammar and returns objects
    required for interpretation and execution.

    Args:
        query: The BlendSQL query to preprocess
        default_model: Model object, which we attach to each parsed_dict

    Returns:
        Tuple, containing:

            - ingredient_alias_to_parsed_dict

            - tables_in_ingredients

    Examples:
        ```python
        preprocess_blendsql(
            query="SELECT * FROM documents JOIN {{LLMJoin(left_on='w::player', right_on='documents::title')}} WHERE rank = 2",
            default_model=default_model
        )
        ```
        ```text
        (
            {
                '{{A()}}': {
                    'function': 'LLMJoin',
                    'args': [],
                    'ingredient_aliasname': 'A',
                    'raw': "{{ LLMJoin ( left_on= 'w::player' , right_on= 'documents::title' ) }}",
                    'kwargs_dict': {
                        'left_on': 'w::player',
                        'right_on': 'documents::title'
                    }
                }
            },
            {'documents', 'w'}
        )
        ```
    """
    ingredient_alias_to_parsed_dict: Dict[str, dict] = {}
    ingredient_str_to_alias: Dict[str, str] = {}
    tables_in_ingredients: Set[str] = set()
    query = re.sub(r"(\s+)", " ", query)
    reversed_scan_res = [scan_res for scan_res in grammar.scanString(query)][::-1]
    for idx, (parse_results, start, end) in enumerate(reversed_scan_res):
        original_ingredient_string = query[start:end]
        # If we're in between parentheses, add a `SELECT`
        # This way it gets picked up as a subquery to parse later
        # TODO: is this safe to do?
        if query[start - 2] == "(" and query[end + 1] == ")":
            inserted_select = " SELECT "
            query = query[:start] + inserted_select + query[start:end] + query[end:]
            start += len(inserted_select)
            end += len(inserted_select)
        if (
            original_ingredient_string in ingredient_str_to_alias
        ):  # If we've already processed this function, no need to do it again
            substituted_ingredient_alias = ingredient_str_to_alias[
                original_ingredient_string
            ]
        else:
            parsed_results_dict = parse_results.as_dict()
            ingredient_aliasname = string.ascii_uppercase[idx]
            parsed_results_dict["ingredient_aliasname"] = ingredient_aliasname
            substituted_ingredient_alias = "{{" + f"{ingredient_aliasname}()" + "}}"
            ingredient_str_to_alias[
                original_ingredient_string
            ] = substituted_ingredient_alias
            # Remove parentheses at beginning and end of arg/kwarg
            # TODO: this should be handled by pyparsing
            for arg_type in {"args", "kwargs"}:
                for idx in range(len(parsed_results_dict[arg_type])):
                    curr_arg = parsed_results_dict[arg_type][idx]
                    curr_arg = curr_arg[-1] if arg_type == "kwargs" else curr_arg
                    if not isinstance(curr_arg, str):
                        continue
                    formatted_curr_arg = re.sub(
                        r"(^\()(.*)(\)$)", r"\2", curr_arg
                    ).strip()
                    if arg_type == "args":
                        parsed_results_dict[arg_type][idx] = formatted_curr_arg
                    else:
                        parsed_results_dict[arg_type][idx][
                            -1
                        ] = formatted_curr_arg  # kwargs gets returned as ['limit', '=', 10] sort of list
            # So we need to parse by indices in dict expression
            # maybe if I was better at pp.Suppress we wouldn't need this
            kwargs_dict = {x[0]: x[-1] for x in parsed_results_dict["kwargs"]}
            kwargs_dict[IngredientKwarg.MODEL] = default_model
            # Heuristic check to see if we should snag the singleton arg as context
            if (
                len(parsed_results_dict["args"]) == 1
                and "::" in parsed_results_dict["args"][0]
            ):
                kwargs_dict[IngredientKwarg.CONTEXT] = parsed_results_dict["args"].pop()
            context_arg = kwargs_dict.get(
                IngredientKwarg.CONTEXT,
                (
                    parsed_results_dict["args"][1]
                    if len(parsed_results_dict["args"]) > 1
                    else (
                        parsed_results_dict["args"][1]
                        if len(parsed_results_dict["args"]) > 1
                        else None
                    )
                ),
            )
            for arg in {
                context_arg,
                kwargs_dict.get("left_on", None),
                kwargs_dict.get("right_on", None),
            }:
                if arg is None:
                    continue
                if not arg.upper().startswith(("SELECT", "WITH")):
                    tablename, _ = get_tablename_colname(arg)
                    tables_in_ingredients.add(tablename)
            # We don't need raw kwargs anymore
            # in the future, we just refer to kwargs_dict
            parsed_results_dict.pop("kwargs")
            # Below we track the 'raw' representation, in case we need to pass into
            #   a recursive BlendSQL call later
            ingredient_alias_to_parsed_dict[
                substituted_ingredient_alias
            ] = parsed_results_dict | {
                "raw": query[start:end],
                "kwargs_dict": kwargs_dict,
            }
        query = query[:start] + substituted_ingredient_alias + query[end:]
    return (query.strip(), ingredient_alias_to_parsed_dict, tables_in_ingredients)