Teaching BlendSQL via In-Context Learning
As described in our paper, the real power of BlendSQL emerges when it is used as an intermediate representation for tasks that require complex reasoning across many different forms of data.
In this notebook, we show how to 'teach' an instruction-finetuned language model to write in this new dialect of SQL via in-context learning. Our pipeline can be summarized as:
- Define few-shot examples using our dataset
- Design a prompt for our Parser LLM that explains the task we want it to achieve
- Call our Parser with the prompt plus a question to get a BlendSQL query
- Execute the BlendSQL query with `blend()` to retrieve the final answer
In [14]:
from typing import List
from textwrap import dedent
import outlines
from blendsql import blend
from blendsql.ingredients import LLMMap, LLMJoin, LLMQA
from blendsql.models import OpenaiLLM
from blendsql.models._model import Model
from blendsql._program import Program
from blendsql.db import SQLite
from blendsql.utils import fetch_from_hub
In [15]:
# 1) Define our few-shot examples
examples = [
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "name" TEXT,\n "province" TEXT,\n "city" TEXT,\n "year" TEXT,\n "remarks" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index name province city year remarks\n 0 abdul rahman mosque kabul province kabul 2009 largest mosque in afghanistan\n 1 friday mosque of kandahar kandahar province kandahar 1750 houses the cloak of the islamic prophet muhammad\n 2 omar al-farooq mosque kandahar province kandahar 2014 built on the site that was a popular cinema of kandahar . [ 1 ]\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "Who were the builders of the mosque in Herat with fire temples ?",
        "blendsql": """
        {{
            LLMQA(
                'Who were the builders of the mosque?',
                (
                    SELECT documents.title AS 'Building', documents.content FROM documents
                    JOIN {{
                        LLMJoin(
                            left_on='w::name',
                            right_on='documents::title'
                        )
                    }}
                    WHERE w.city = 'herat' AND w.remarks LIKE '%fire temple%'
                )
            )
        }}
        """,
    },
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "no" INTEGER,\n "rider" TEXT,\n "team" TEXT,\n "motorcycle" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index no rider team motorcycle\n 0 1 carl fogarty ducati performance ducati 996\n 1 4 akira yanagawa kawasaki racing team kawasaki zx-7rr\n 2 5 colin edwards castrol honda honda rc45\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "After what season did the number 7 competitor retire ?",
        "blendsql": """
        {{
            LLMQA(
                'When did the competitor retire?',
                (
                    SELECT documents.title AS 'Competitor', documents.content FROM documents
                    JOIN {{
                        LLMJoin(
                            left_on='w::rider',
                            right_on='documents::title'
                        )
                    }}
                    WHERE w.no = 7
                )
            )
        }}
        """,
    },
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "year" TEXT,\n "winner" TEXT,\n "position" TEXT,\n "school" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index year winner position school\n 0 1961-62 ron ryan right wing colby\n 1 1962-63 bob brinkworth center rensselaer\n 2 1963-64 bob brinkworth center rensselaer\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "What year was the 1971-72 ECAC Hockey Player of the Year born ?",
        "blendsql": """
        {{
            LLMQA(
                'What year was the player born?',
                (
                    SELECT documents.title AS 'Player', documents.content FROM documents
                    JOIN {{
                        LLMJoin(
                            left_on='w::winner',
                            right_on='documents::title'
                        )
                    }}
                    WHERE w.year = '1971-72'
                )
            )
        }}
        """,
    },
    {
        "serialized_db": 'CREATE TABLE "w" (\n"index" INTEGER,\n "date" TEXT,\n "language" TEXT,\n "language family" TEXT,\n "region" TEXT\n)\n/*\n3 example rows:\nSELECT * FROM w LIMIT 3\n index date language language family region\n 0 early 2nd millennium bce sumerian isolate mesopotamia\n 1 2nd millennium bce eblaite semitic syria\n 2 ca . 1100 bce hittite anatolian anatolia\n*/\n\nCREATE VIRTUAL TABLE "documents" USING fts5(title, content, tokenize = \'trigram\')',
        "question": "What was the language family that was used in Hattusa , as well as parts of the northern Levant and Upper Mesopotamia ?",
        "blendsql": """
        SELECT "language family" FROM w
        WHERE language = {{
            LLMQA(
                'Which language was used in Hattusa, as well as parts of the northern Levant and Upper Mesopotamia ?',
                (SELECT title, content FROM documents WHERE documents MATCH 'hattusa'),
                options='w::language'
            )
        }}
        """,
    },
]
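Each `serialized_db` value above is a single escaped string; printing one shows the format the Parser LLM actually sees, i.e. a CREATE TABLE statement, three example rows, and the `documents` virtual table:

# Inspect the schema serialization of the first few-shot example.
print(examples[0]["serialized_db"])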
In [16]:
# 2) Define our prompt to the Parser LLM
class ParserProgram(Program):
    def __call__(self, model: Model, examples: List[dict], serialized_db: str, question: str, **kwargs):
        prompt = ""
        prompt += dedent("""
        Generate BlendSQL given the question, table, and passages to answer the question correctly.
        BlendSQL is a superset of SQLite, which adds external function calls for information not found within native SQLite.
        These external functions should be wrapped in double curly brackets.
        If the contents of a question-relevant column are not suitable for SQL comparisons or calculations, map them to a new column with clean content using the grammar:
        `LLMMap('question', '{table}::{column}')`
        If mapping to a new column still cannot answer the question with valid SQL, turn to an end-to-end solution using the grammar:
        `LLMQA('{question}', ({blendsql}))`
        If we need to do a `join` operation where there is imperfect alignment between table values, use the grammar:
        `LLMJoin(left_on='{table}::{column}', right_on='{table}::{column}')`
        ONLY use these BlendSQL ingredients if necessary.
        Answer parts of the question in vanilla SQL, if possible.
        Examples:\n
        """)
        # Append each few-shot example, then the current schema + question
        for example in examples:
            prompt += f"{example['serialized_db']}\n\n"
            prompt += f"Question: {example['question']}\n"
            prompt += f"BlendSQL: {example['blendsql']}\n"
        prompt += f"{serialized_db}\n\n"
        prompt += f"Question: {question}\n"
        prompt += "BlendSQL: "
        generator = outlines.generate.text(model.model_obj)
        result = generator(prompt)
        return (result, prompt)
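Before spending any model calls, it can help to eyeball what the Parser LLM will actually be prompted with. The sketch below simply mirrors the formatting loop from `ParserProgram` above (no model is involved), printing the first few-shot example as it will appear in the prompt:

# Preview the few-shot portion of the prompt by mirroring the
# formatting loop in ParserProgram (no model call involved).
preview = ""
for example in examples[:1]:
    preview += f"{example['serialized_db']}\n\n"
    preview += f"Question: {example['question']}\n"
    preview += f"BlendSQL: {example['blendsql']}\n"
print(preview)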
In [17]:
def few_shot_blendsql(question: str, db: SQLite, parser: Model, blender: Model):
    # 3) Call the parser with our prompt
    predicted_query = parser.predict(
        program=ParserProgram,
        serialized_db=db.to_serialized(),
        question=question,
        examples=examples,
    )
    # 4) Execute the BlendSQL query to get the final answer
    smoothie = blend(
        query=predicted_query,
        db=db,
        ingredients={LLMMap, LLMQA, LLMJoin},
        verbose=False,
        default_model=blender,
    )
    return (predicted_query, smoothie)
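Note that the Parser (which writes the BlendSQL query) and the Blender (passed to `blend()` as `default_model` to resolve the ingredients at execution time) are decoupled: they can be entirely different models, though below we use gpt-3.5-turbo for both roles.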
In [18]:
blendsql, smoothie = few_shot_blendsql(
    question="What team did New Zealand play in the city featuring the Mount Panorama racetrack ?",
    db=SQLite(fetch_from_hub("1884_New_Zealand_rugby_union_tour_of_New_South_Wales_1.db")),
    blender=OpenaiLLM("gpt-3.5-turbo"),
    parser=OpenaiLLM("gpt-3.5-turbo"),
)
In [19]:
print(blendsql)
SELECT rival FROM w WHERE city = {{ LLMQA( 'What city features the Mount Panorama racetrack?', (SELECT title, content FROM documents WHERE documents MATCH 'mount panorama racetrack'), options='w::city' ) }}
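Here the Parser has nested an LLMQA ingredient inside the WHERE clause: the subquery retrieves passages matching 'mount panorama racetrack', the LLM answers the sub-question over them, and `options='w::city'` constrains that answer to values actually present in the `city` column, so the outer vanilla SQL comparison can resolve.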
In [20]:
smoothie.df
Out[20]:
|   | rival |
|---|---|
| 0 | western districts |
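The `options` argument steered the LLMQA ingredient to a value from `w.city` (bathurst, home of the Mount Panorama circuit), and the surrounding SQL then returned the rival New Zealand played in that city: western districts.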