Skip to content

LLMJoin

ingredients

Description

This ingredient handles the logic of semantic JOIN clauses between tables.

In other words, it creates a custom mapping between a pair of value sets. Behind the scenes, this mapping is then used to create an auxiliary table to use in carrying out an INNER JOIN.

For example:

SELECT Capitals.name, State.name FROM Capitals
    JOIN {{
        LLMJoin(
            'Align state to capital', 
            left_on='States::name', 
            right_on='Capitals::name'
        )
    }}
The above example hints at a database schema that would make E.F Codd very angry: why do we have two separate tables States and Capitals with no foreign key to join the two?

BlendSQL was built to interact with tables "in-the-wild", and many (such as those on Wikipedia) do not have these convenient properties of well-designed relational models.

For this reason, we can leverage the internal knowledge of a pre-trained LLM to do the JOIN operation for us.

JoinProgram

Bases: Program

Source code in blendsql/ingredients/builtin/join/main.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
class JoinProgram(Program):
    def __call__(
        self,
        model: Model,
        join_criteria: str,
        left_values: List[str],
        right_values: List[str],
        sep: str,
        **kwargs,
    ) -> Tuple[str, str]:
        if isinstance(model, LocalModel):
            m: guidance.models.Model = model.model_obj
            with guidance.system():
                m += "You are a database expert in charge of performing a modified `LEFT JOIN` operation. This `LEFT JOIN` is based on a semantic criteria given by the user."
                m += f"\nThe left and right value alignment should be separated by '{sep}', with each new `JOIN` alignment goin on a newline. If a given left value has no corresponding right value, give '-' as a response."
                m += newline_dedent(
                    """
                Criteria: Join to same topics.

                Left Values:
                joshua fields
                bob brown
                ron ryan

                Right Values:
                ron ryan
                colby mules
                bob brown (ice hockey)
                josh fields (pitcher)

                Output:
                {
                    "joshua fields": "josh fields (pitcher)",
                    "bob brown": "bob brown (ice hockey)",
                    "ron ryan": "ron ryan"
                }

                ---
                """
                )
            with guidance.user():
                m += newline_dedent(
                    """
                    Criteria: {}

                    Left Values:
                    {}

                    Right Values:
                    {}

                    Output:

                    """.format(
                        join_criteria, "\n".join(left_values), "\n".join(right_values)
                    )
                )
            prompt = m._current_prompt()

            @guidance(stateless=True, dedent=False)
            def make_predictions(lm, left_values, right_values):
                lm += "{"
                gen_f = guidance.select(options=right_values)
                for idx, value in enumerate(left_values):
                    lm += (
                        f'\n\t"{value}": '
                        + guidance.capture(gen_f, name=value)
                        + ("," if idx + 1 != len(right_values) else "")
                    )
                return lm

            with guidance.assistant():
                m += make_predictions(
                    left_values=left_values, right_values=right_values
                )
            return (m._variables, prompt)
        else:
            # Use 'old' style prompt for remote models
            prompt = ""
            prompt += "You are a database expert in charge of performing a modified `LEFT JOIN` operation. This `LEFT JOIN` is based on a semantic criteria given by the user."
            prompt += f"\nThe left and right value alignment should be separated by '{sep}', with each new `JOIN` alignment goin on a newline. If a given left value has no corresponding right value, give '-' as a response."
            prompt += newline_dedent(
                """
            Criteria: Join to same topics.

            Left Values:
            joshua fields
            bob brown
            ron ryan

            Right Values:
            ron ryan
            colby mules
            bob brown (ice hockey)
            josh fields (pitcher)

            Output:
            joshua fields;josh fields (pitcher)
            bob brown;bob brown (ice hockey)
            ron ryan;ron ryan

            ---
            """
            )
            prompt += newline_dedent(
                """
                Criteria: {}

                Left Values:
                {}

                Right Values:
                {}

                Output:
                """.format(
                    join_criteria, "\n".join(left_values), "\n".join(right_values)
                )
            )
            max_tokens = (
                len(
                    model.tokenizer.encode(
                        "".join(left_values)
                        + "".join(right_values)
                        + (CONST.DEFAULT_ANS_SEP * len(left_values)),
                    )
                )
                if model.tokenizer is not None
                else None
            )
            response = generate(
                model, prompt=prompt, max_tokens=max_tokens, stop_at=["---"]
            )
            # Post-process language model response
            _result = response.split("\n")
            mapping: dict = {}
            for item in _result:
                if CONST.DEFAULT_ANS_SEP in item:
                    k, v = item.rsplit(CONST.DEFAULT_ANS_SEP, 1)
                    if any(pred == CONST.DEFAULT_NAN_ANS for pred in {k, v}):
                        continue
                    mapping[k] = v
            return (mapping, prompt)

LLMJoin

Bases: JoinIngredient

Source code in blendsql/ingredients/builtin/join/main.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
class LLMJoin(JoinIngredient):
    DESCRIPTION = """
    If we need to do a `join` operation where there is imperfect alignment between table values, use the new function:
        `{{LLMJoin(left_on='table::column', right_on='table::column')}}`
    """

    def run(
        self,
        model: Model,
        left_values: List[str],
        right_values: List[str],
        question: Optional[str] = None,
        **kwargs,
    ) -> dict:
        if question is None:
            question = "Join to same topics."
        mapping = model.predict(
            program=JoinProgram,
            sep=CONST.DEFAULT_ANS_SEP,
            left_values=left_values,
            right_values=right_values,
            join_criteria=question,
            **kwargs,
        )
        return {k: v for k, v in mapping.items() if v != CONST.DEFAULT_NAN_ANS}

run(model, left_values, right_values, question=None, **kwargs)

Source code in blendsql/ingredients/builtin/join/main.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def run(
    self,
    model: Model,
    left_values: List[str],
    right_values: List[str],
    question: Optional[str] = None,
    **kwargs,
) -> dict:
    if question is None:
        question = "Join to same topics."
    mapping = model.predict(
        program=JoinProgram,
        sep=CONST.DEFAULT_ANS_SEP,
        left_values=left_values,
        right_values=right_values,
        join_criteria=question,
        **kwargs,
    )
    return {k: v for k, v in mapping.items() if v != CONST.DEFAULT_NAN_ANS}