LLMJoin
Description
This ingredient handles the logic of semantic JOIN
clauses between tables.
In other words, it creates a custom mapping between a pair of value sets. Behind the scenes, this mapping is then used to create an auxiliary table to use in carrying out an INNER JOIN
.
For example:
SELECT Capitals.name, States.name FROM Capitals
JOIN {{
LLMJoin(
'Align state to capital',
left_on='States::name',
right_on='Capitals::name'
)
}}
The above example hints at a database schema that would make E. F. Codd very angry: why do we have two separate tables States
and Capitals
with no foreign key to join the two?
BlendSQL was built to interact with tables "in-the-wild", and many (such as those on Wikipedia) do not have these convenient properties of well-designed relational models.
For this reason, we can leverage the internal knowledge of a pre-trained LLM to do the JOIN
operation for us.
JoinProgram
Bases: Program
Source code in blendsql/ingredients/builtin/join/main.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
class JoinProgram(Program):
    """Prompt program that aligns two value sets via an LLM.

    Given a semantic join criteria plus lists of left and right values,
    produces a mapping from each left value to its matching right value.
    Two execution paths exist:

    - ``LocalModel``: constrained decoding via ``guidance`` — each predicted
      right value is restricted with ``guidance.select`` to one of the known
      ``right_values``, so the output is always well-formed.
    - Remote models: a plain few-shot text prompt; the free-text response is
      parsed line-by-line on ``sep``.
    """

    def __call__(
        self,
        model: Model,
        join_criteria: str,
        left_values: List[str],
        right_values: List[str],
        sep: str,
        **kwargs,
    ) -> Tuple[dict, str]:
        # NOTE: return annotation fixed from Tuple[str, str] — both branches
        # return a (mapping, prompt) pair.
        if isinstance(model, LocalModel):
            m: guidance.models.Model = model.model_obj
            with guidance.system():
                m += "You are a database expert in charge of performing a modified `LEFT JOIN` operation. This `LEFT JOIN` is based on a semantic criteria given by the user."
                # Typo fix: "goin" -> "going"
                m += f"\nThe left and right value alignment should be separated by '{sep}', with each new `JOIN` alignment going on a newline. If a given left value has no corresponding right value, give '-' as a response."
                m += newline_dedent(
                    """
                Criteria: Join to same topics.

                Left Values:
                joshua fields
                bob brown
                ron ryan

                Right Values:
                ron ryan
                colby mules
                bob brown (ice hockey)
                josh fields (pitcher)

                Output:
                {
                    "joshua fields": "josh fields (pitcher)",
                    "bob brown": "bob brown (ice hockey)",
                    "ron ryan": "ron ryan"
                }

                ---
                """
                )
            with guidance.user():
                m += newline_dedent(
                    """
                    Criteria: {}

                    Left Values:
                    {}

                    Right Values:
                    {}

                    Output:
                    """.format(
                        join_criteria, "\n".join(left_values), "\n".join(right_values)
                    )
                )
            prompt = m._current_prompt()

            @guidance(stateless=True, dedent=False)
            def make_predictions(lm, left_values, right_values):
                # Emit a JSON-like object, one captured prediction per left
                # value; guidance.select constrains each prediction to one of
                # the known right values.
                lm += "{"
                gen_f = guidance.select(options=right_values)
                for idx, value in enumerate(left_values):
                    lm += (
                        f'\n\t"{value}": '
                        + guidance.capture(gen_f, name=value)
                        # Bug fix: trailing-comma check must compare against
                        # len(left_values) (the sequence being iterated), not
                        # len(right_values).
                        + ("," if idx + 1 != len(left_values) else "")
                    )
                return lm

            with guidance.assistant():
                m += make_predictions(
                    left_values=left_values, right_values=right_values
                )
            return (m._variables, prompt)
        else:
            # Use 'old' style prompt for remote models
            prompt = ""
            prompt += "You are a database expert in charge of performing a modified `LEFT JOIN` operation. This `LEFT JOIN` is based on a semantic criteria given by the user."
            # Typo fix: "goin" -> "going"
            prompt += f"\nThe left and right value alignment should be separated by '{sep}', with each new `JOIN` alignment going on a newline. If a given left value has no corresponding right value, give '-' as a response."
            prompt += newline_dedent(
                """
            Criteria: Join to same topics.

            Left Values:
            joshua fields
            bob brown
            ron ryan

            Right Values:
            ron ryan
            colby mules
            bob brown (ice hockey)
            josh fields (pitcher)

            Output:
            joshua fields;josh fields (pitcher)
            bob brown;bob brown (ice hockey)
            ron ryan;ron ryan

            ---
            """
            )
            prompt += newline_dedent(
                """
                Criteria: {}

                Left Values:
                {}

                Right Values:
                {}

                Output:
                """.format(
                    join_criteria, "\n".join(left_values), "\n".join(right_values)
                )
            )
            # Budget enough tokens to echo every value plus one separator per
            # left value; None (no cap) when the model has no tokenizer.
            max_tokens = (
                len(
                    model.tokenizer.encode(
                        "".join(left_values)
                        + "".join(right_values)
                        + (CONST.DEFAULT_ANS_SEP * len(left_values)),
                    )
                )
                if model.tokenizer is not None
                else None
            )

            response = generate(
                model, prompt=prompt, max_tokens=max_tokens, stop_at=["---"]
            )
            # Post-process language model response: one "left<sep>right"
            # alignment per line; rsplit guards against sep appearing inside
            # the left value itself.
            _result = response.split("\n")
            mapping: dict = {}
            for item in _result:
                if CONST.DEFAULT_ANS_SEP in item:
                    k, v = item.rsplit(CONST.DEFAULT_ANS_SEP, 1)
                    # Skip pairs where the model declared "no match".
                    if any(pred == CONST.DEFAULT_NAN_ANS for pred in {k, v}):
                        continue
                    mapping[k] = v
            return (mapping, prompt)
LLMJoin
Bases: JoinIngredient
Source code in blendsql/ingredients/builtin/join/main.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
class LLMJoin(JoinIngredient):
    DESCRIPTION = """
    If we need to do a `join` operation where there is imperfect alignment between table values, use the new function:
        `{{LLMJoin(left_on='table::column', right_on='table::column')}}`
    """

    def run(
        self,
        model: Model,
        left_values: List[str],
        right_values: List[str],
        question: Optional[str] = None,
        **kwargs,
    ) -> dict:
        """Semantically align ``left_values`` to ``right_values`` with an LLM.

        Delegates to :class:`JoinProgram` via ``model.predict`` and returns a
        dict mapping each left value to its matched right value, omitting any
        pair the model marked as having no match.
        """
        criteria = "Join to same topics." if question is None else question
        predicted = model.predict(
            program=JoinProgram,
            sep=CONST.DEFAULT_ANS_SEP,
            left_values=left_values,
            right_values=right_values,
            join_criteria=criteria,
            **kwargs,
        )
        # Keep only alignments where the model produced a real match.
        aligned: dict = {}
        for left, right in predicted.items():
            if right != CONST.DEFAULT_NAN_ANS:
                aligned[left] = right
        return aligned
run(model, left_values, right_values, question=None, **kwargs)
Source code in blendsql/ingredients/builtin/join/main.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181 | def run(
self,
model: Model,
left_values: List[str],
right_values: List[str],
question: Optional[str] = None,
**kwargs,
) -> dict:
if question is None:
question = "Join to same topics."
mapping = model.predict(
program=JoinProgram,
sep=CONST.DEFAULT_ANS_SEP,
left_values=left_values,
right_values=right_values,
join_criteria=question,
**kwargs,
)
return {k: v for k, v in mapping.items() if v != CONST.DEFAULT_NAN_ANS}
|