Teaching BlendSQL via In-Context Learning
As described in our paper, the real power of BlendSQL emerges when it is used as an intermediate representation for tasks that require complex reasoning across many different forms of data.
In this notebook, we show how to 'teach' an instruction-finetuned language model to write in this new dialect of SQL via in-context learning. Our pipeline can be summarized as:
- Define few-shot examples using our dataset
- Design a prompt for our Parser LLM that explains the task we want it to achieve
- Call our Parser with the prompt plus a question to get a BlendSQL query
- Execute the BlendSQL query with `blend()` to retrieve the final answer
In [14]:
from typing import List
from textwrap import dedent
import outlines
from blendsql import blend
from blendsql.ingredients import LLMMap, LLMJoin, LLMQA
from blendsql.models import OpenaiLLM
from blendsql.models._model import Model
from blendsql._program import Program
from blendsql.db import SQLite
from blendsql.utils import fetch_from_hub
In [15]:
# 1) Define our few-shot examples
examples = [
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "name" TEXT,\n "province" TEXT,\n "city" TEXT,\n "year" TEXT,\n "remarks" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index name province city year remarks\n 0 abdul rahman mosque kabul province kabul 2009 largest mosque in afghanistan\n 1 friday mosque of kandahar kandahar province kandahar 1750 houses the cloak of the islamic prophet muhammad\n 2 omar al-farooq mosque kandahar province kandahar 2014 built on the site that was a popular cinema of kandahar . [ 1 ]\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "Who were the builders of the mosque in Herat with fire temples ?",
        "blendsql": """
        {{
            LLMQA(
                'Who were the builders of the mosque?',
                (
                    SELECT documents.title AS 'Building', documents.content FROM documents
                    JOIN {{
                        LLMJoin(
                            left_on='w::name',
                            right_on='documents::title'
                        )
                    }}
                    WHERE w.city = 'herat' AND w.remarks LIKE '%fire temple%'
                )
            )
        }}
        """,
    },
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "no" INTEGER,\n "rider" TEXT,\n "team" TEXT,\n "motorcycle" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index no rider team motorcycle\n 0 1 carl fogarty ducati performance ducati 996\n 1 4 akira yanagawa kawasaki racing team kawasaki zx-7rr\n 2 5 colin edwards castrol honda honda rc45\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "After what season did the number 7 competitor retire ?",
        "blendsql": """
        {{
            LLMQA(
                'When did the competitor retire?',
                (
                    SELECT documents.title AS 'Competitor', documents.content FROM documents
                    JOIN {{
                        LLMJoin(
                            left_on='w::rider',
                            right_on='documents::title'
                        )
                    }}
                    WHERE w.no = 7
                )
            )
        }}
        """,
    },
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "year" TEXT,\n "winner" TEXT,\n "position" TEXT,\n "school" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index year winner position school\n 0 1961-62 ron ryan right wing colby\n 1 1962-63 bob brinkworth center rensselaer\n 2 1963-64 bob brinkworth center rensselaer\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "What year was the 1971-72 ECAC Hockey Player of the Year born ?",
        "blendsql": """
        {{
            LLMQA(
                'What year was the player born?',
                (
                    SELECT documents.title AS 'Player', documents.content FROM documents
                    JOIN {{
                        LLMJoin(
                            left_on='w::winner',
                            right_on='documents::title'
                        )
                    }}
                    WHERE w.year = '1971-72'
                )
            )
        }}
        """,
    },
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "date" TEXT,\n "language" TEXT,\n "language family" TEXT,\n "region" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index date language language family region\n 0 early 2nd millennium bce sumerian isolate mesopotamia\n 1 2nd millennium bce eblaite semitic syria\n 2 ca . 1100 bce hittite anatolian anatolia\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "What was the language family that was used in Hattusa , as well as parts of the northern Levant and Upper Mesopotamia ?",
        "blendsql": """
        SELECT "language family" FROM w
        WHERE language = {{
            LLMQA(
                'Which language was used in Hattusa, as well as parts of the northern Levant and Upper Mesopotamia ?',
                (SELECT title, content FROM documents WHERE documents MATCH 'hattusa'),
                options='w::language'
            )
        }}
        """,
    },
]
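Each `serialized_db` value above is a single escaped string; printing one shows the format the Parser LLM actually sees, i.e. a CREATE TABLE statement, three example rows, and the `documents` virtual table:

# Inspect the schema serialization of the first few-shot example.
print(examples[0]["serialized_db"])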
In [16]:
# 2) Define our prompt to the Parser LLM
class ParserProgram(Program):
    def __call__(self, model: Model, examples: List[dict], serialized_db: str, question: str, **kwargs):
        prompt = ""
        prompt += dedent("""
        Generate BlendSQL given the question, table, and passages to answer the question correctly.
        BlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.
        These external functions should be wrapped in double curly brackets.
        If the contents of a question-relevant column are not suitable for SQL comparisons or calculations, map them to a new column with clean content using the grammar:
        `LLMMap('question', '{table}::{column}')`
        If mapping to a new column still cannot answer the question with valid SQL, turn to an end-to-end solution using the grammar:
        `LLMQA('{question}', ({blendsql}))`
        If we need to do a `join` operation where there is imperfect alignment between table values, use the grammar:
        `LLMJoin(left_on='{table}::{column}', right_on='{table}::{column}')`
        ONLY use these BlendSQL ingredients if necessary.
        Answer parts of the question in vanilla SQL, if possible.
        Examples:\n
        """)
        # Append each few-shot example, then the current schema + question
        for example in examples:
            prompt += f"{example['serialized_db']}\n\n"
            prompt += f"Question: {example['question']}\n"
            prompt += f"BlendSQL: {example['blendsql']}\n"
        prompt += f"{serialized_db}\n\n"
        prompt += f"Question: {question}\n"
        prompt += "BlendSQL: "
        generator = outlines.generate.text(model.model_obj)
        result = generator(prompt)
        return (result, prompt)
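Before spending any model calls, it can help to eyeball what the Parser LLM will actually be prompted with. The sketch below simply mirrors the formatting loop from `ParserProgram` above (no model is involved), printing the first few-shot example as it will appear in the prompt:

# Preview the few-shot portion of the prompt by mirroring the
# formatting loop in ParserProgram (no model call involved).
preview = ""
for example in examples[:1]:
    preview += f"{example['serialized_db']}\n\n"
    preview += f"Question: {example['question']}\n"
    preview += f"BlendSQL: {example['blendsql']}\n"
print(preview)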
In [17]:
def few_shot_blendsql(question: str, db: SQLite, parser: Model, blender: Model):
    # 3) Call the parser with our prompt
    predicted_query = parser.predict(
        program=ParserProgram,
        serialized_db=db.to_serialized(),
        question=question,
        examples=examples,
    )
    # 4) Execute the BlendSQL query to get the final answer
    smoothie = blend(
        query=predicted_query,
        db=db,
        ingredients={LLMMap, LLMQA, LLMJoin},
        verbose=False,
        default_model=blender,
    )
    return (predicted_query, smoothie)
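Note that the Parser (which writes the BlendSQL query) and the Blender (passed to `blend()` as `default_model` to resolve the ingredients at execution time) are decoupled: they can be entirely different models, though below we use gpt-3.5-turbo for both roles.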
In [18]:
blendsql, smoothie = few_shot_blendsql(
    question="What team did New Zealand play in the city featuring the Mount Panorama racetrack ?",
    db=SQLite(fetch_from_hub("1884_New_Zealand_rugby_union_tour_of_New_South_Wales_1.db")),
    blender=OpenaiLLM("gpt-3.5-turbo"),
    parser=OpenaiLLM("gpt-3.5-turbo"),
)
In [19]:
print(blendsql)
SELECT rival FROM w WHERE city = {{ LLMQA( 'What city features the Mount Panorama racetrack?', (SELECT title, content FROM documents WHERE documents MATCH 'mount panorama racetrack'), options='w::city' ) }}
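Here the Parser has nested an LLMQA ingredient inside the WHERE clause: the subquery retrieves passages matching 'mount panorama racetrack', the LLM answers the sub-question over them, and `options='w::city'` constrains that answer to values actually present in the `city` column, so the outer vanilla SQL comparison can resolve.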
In [20]:
smoothie.df
Out[20]:
|   | rival |
|---|---|
| 0 | western districts |
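The `options` argument steered the LLMQA ingredient to a value from `w.city` (bathurst, home of the Mount Panorama circuit), and the surrounding SQL then returned the rival New Zealand played in that city: western districts.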