Structured Data Extraction

summ.structure.Structurer

Bases: Chain

Structurers infer a set of structured data to extract based on the query, then extract that data from every source document.
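
A minimal usage sketch (illustrative only: the import paths follow the module headings on this page, and it assumes the Chain base class supplies a configured LLM and cache):

from langchain.schema import Document
from summ.structure.structurer import Structurer

docs = [
    Document(page_content="Over in sales, we mostly use Google."),
    Document(page_content="Engineering switched to Bing last year."),
]

structurer = Structurer(
    query="In each department, how many times did people prefer Google over Bing?"
)
results = structurer.extract(docs)
# e.g. {"department": [...], "preferred": {...}} -- the exact shape depends
# on the Metric specs inferred from the query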

Source code in summ/structure/structurer.py
class Structurer(Chain):
    """Structurers infer a set of structured data to extract based on the query, then extract that data from every source document."""

    def __init__(self, query: str, **kwargs):
        super().__init__(**kwargs)
        self.query = query

    def metrics_template(self) -> PromptTemplate:
        """The template to transform a query into a list of metrics."""
        return PromptTemplate(
            template=dedent(
                f"""
                Use the query to determine which structured data is needed, and for each, write a specification which will extract and collect the data.
                If the query is qualitative, you can return an empty list.
                Your response must be in valid JSON format. Do not extract the information yet, just describe how to do so.
                The options for type are: {', '.join(list(MetricType))}.
                The options for collect are: {', '.join(list(MetricCollect))}.
                The prompt should minimize variance in the response.

                For example:
                Prompt: In each department, how many times did people prefer Google over Bing?
                Response:
                ```
                [
                    {{"metric": "department", "prompt": "Extract the company department that the user of this interview works in.", "type": "string", "collect": "list"}},
                    {{"metric": "preferred", "prompt": "Which of the following options best represents which search engine was preferred?", "type": "enum", "options": ["GOOGLE", "BING", "OTHER"], "collect": "count_unique"}},
                ]
                ```

                Prompt: {{{{ query }}}}
                Response:
                ```
                """
            ),
            input_variables=["query"],
            template_format="jinja2",
        )

    def doc_template(self) -> PromptTemplate:
        """The template to transform a document into a list of metrics."""

        return PromptTemplate(
            template=dedent(
                """
                Your task is to take a spec describing how to extract structured data, and apply it to a document.
                You must follow the spec exactly. For example, if the spec specifies an enum, your response must be one of the options.

                For example:
                Document:
                ```
                Yea over in sales, we prefer Google. Rest of the company likes DuckDuckGo.
                ```
                Spec:
                ```
                [
                    {"metric": "department", "prompt": "Extract the company department that the user of this interview works in.", "type": "string", "collect": "list"},
                    {"metric": "preferred", "prompt": "Which of the following options best represents which search engine was preferred?", "type": "enum", "options": ["GOOGLE", "BING", "OTHER"], "collect": "count_unique"},
                ]
                ```
                Response:
                ```
                {"department": "Engineering", "preferred": "GOOGLE"}
                ```

                Document:
                ```
                {{ text }}
                ```
                Spec:
                ```
                {{ spec }}
                ```
                Response:
                ```
                """
            ),
            input_variables=["text", "spec"],
            template_format="jinja2",
        )

    def clean_template(self) -> PromptTemplate:
        """The template to take a set of extracted metrics and clean them up."""

        return PromptTemplate(
            template=dedent(
                """
                Your task is to take a set of extracted metrics and clean them up.
                Each metric will have a spec. You can use the spec to determine how to clean the metric.
                The goal is to minimize variance and make this data useful for aggregation.
                If elements are semantically equivalent, they should be combined.

                For example:
                Query: In each department, how many times did people prefer Google over Bing?
                Spec:
                ```
                [
                    {"metric": "department", "prompt": "Extract the company department that the user of this interview works in.", "type": "string", "collect": "list"},
                    {"metric": "preferred", "prompt": "Which of the following options best represents which search engine was preferred?", "type": "enum", "options": ["GOOGLE", "BING", "OTHER"], "collect": "count_unique"},
                    {"metric": "feelings", "prompt": "How did the individual feel about the search engine?", "type": "string", "collect": "list"},
                ]
                ```
                Data:
                ```
                [
                    {
                        "department": ["Engineering", "eng", "engineering", "sales", "marketing", "markting"],
                        "preferred": {"GOOGLE": 3, "BING": 1, "OTHER": 1}
                        "feelings": ["it was truly awesome", "I really liked it", "awesome for all the months I used it", "sweet and liked it quite a bit"]
                    },
                ]
                ```
                Cleaned:
                ```
                [
                    {
                        "department": ["Engineering", "Sales", "Marketing"],
                        "preferred": {"GOOGLE": 3, "BING": 1}
                        "feelings": ["Awesome", "Liked It", "Sweet"],
                    },
                ]
                ```

                Spec:
                ```
                {{ spec }}
                ```
                Data:
                ```
                {{ data }}
                ```
                Cleaned:
                ```
                """
            ),
            input_variables=["spec", "data"],
            template_format="jinja2",
        )

    @cached_property
    def metrics(self) -> list[Metric]:
        """Extract the metrics from the query"""
        chain = LLMChain(llm=self.llm, prompt=self.metrics_template())
        results = self.cached(
            "metrics",
            chain,
            Document(page_content=self.query),
            lambda d: {"query": d.page_content, "stop": "```"},
        )
        try:
            return [
                m
                for o in cast(list, dirtyjson.loads(results))
                for m in [Metric.safe_parse(o)]
                if m is not None
            ]
        except Exception as e:
            logging.info(e)
            return []

    def metric(self, metric: str) -> Metric:
        return next(m for m in self.metrics if m.metric == metric)

    @property
    def spec(self) -> str:
        return f"[{', '.join([m.json() for m in self.metrics])}]"

    def extract_metrics(self, doc: Document) -> dict[str, MetricValue]:
        """Extract the metrics from the document"""
        results = self.cached(
            "extract_metrics",
            LLMChain(llm=self.llm, prompt=self.doc_template()),
            doc,
            lambda d: {
                "text": d.page_content,
                "spec": self.spec,
                "stop": "```",
            },
        )
        try:
            metrics = {
                k: m
                for k, v in cast(dict[str, TVal], dirtyjson.loads(results)).items()
                for m in [
                    MetricValue.safe_parse({"metric": self.metric(k), "value": v})
                ]
                if m and m.value is not None
            }
        except Exception as e:
            traceback.print_exc()
            metrics = {}

        return metrics

    def clean(self, metrics: dict[str, TVal_]) -> dict[str, TVal_]:
        """Clean up a set of extracted metrics."""

        results = self.cached(
            "clean_metrics",
            LLMChain(llm=self.llm, prompt=self.clean_template()),
            cast(list[Document], []),
            lambda _: {
                "spec": self.spec,
                "data": json.dumps(metrics),
                "stop": "```",
            },
        )

        try:
            return dirtyjson.loads(results)
        except Exception as e:
            logging.info(e)
            return metrics

    def _extract(self, docs: list[Document]) -> dict[str, TVal_]:
        metrics = self._pmap(self.extract_metrics, docs)
        formatted: dict[str, TVal_] = {
            m.metric: m.collect_fn(cs)
            for m in self.metrics
            for cs in [[x for v in metrics for x in [v.get(m.metric, None)] if x]]
        }
        if cleaned := self.clean(formatted):
            for k, v in cleaned.items():
                self.dprint("Metric", k, color="yellow")
                self.dprint("", v)
        return cleaned

    def extract(self, docs: list[Document]) -> dict[str, TVal_]:
        """Extract metrics from all documents"""
        try:
            return self._extract(docs)
        except Exception as e:
            logging.info(e)
            return {}

metrics: list[Metric] cached property

Extract the metrics from the query
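
For example (an illustrative sketch; the Metric field names follow the JSON spec produced by metrics_template, and the actual output depends on the LLM):

structurer = Structurer(
    query="In each department, how many times did people prefer Google over Bing?"
)
for m in structurer.metrics:  # the first access makes one cached LLM call
    print(m.metric, m.collect)
# might print:
# department list
# preferred count_unique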

metrics_template() -> PromptTemplate

The template to transform a query into a list of metrics.
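
The rendered prompt can be inspected with the standard langchain PromptTemplate API, for instance:

prompt = structurer.metrics_template().format(
    query="Which search engine does each department prefer?"
)
print(prompt)  # the instructions and few-shot example, with the query filled in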

doc_template() -> PromptTemplate

The template to transform a document into a list of metrics.

clean_template() -> PromptTemplate

The template to take a set of extracted metrics and clean them up.

extract_metrics(doc: Document) -> dict[str, MetricValue]

Extract the metrics from the document
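
A sketch of per-document extraction, continuing the usage example under Structurer above (the MetricValue fields are assumed from the safe_parse call in the source):

doc = Document(page_content="Yea over in sales, we prefer Google.")
values = structurer.extract_metrics(doc)
# e.g. {"department": <MetricValue value="Sales">,
#       "preferred": <MetricValue value="GOOGLE">}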

clean(metrics: dict[str, TVal_]) -> dict[str, TVal_]

Clean up a set of extracted metrics.
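
The input and output shapes mirror the few-shot example in clean_template; a sketch:

raw = {
    "department": ["Engineering", "eng", "engineering", "sales"],
    "preferred": {"GOOGLE": 3, "BING": 1},
}
cleaned = structurer.clean(raw)
# e.g. {"department": ["Engineering", "Sales"], "preferred": {"GOOGLE": 3, "BING": 1}}
# if the LLM response fails to parse, the input is returned unchanged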

extract(docs: list[Document]) -> dict[str, TVal_]

Extract metrics from all documents

summ.structure.SQLStructurer

Bases: Structurer

Constructs an in-memory SQLite database to store and query the structured data.
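
A minimal usage sketch (illustrative; note that _extract wraps the cleaned rows in a {"data": ...} dict, and the row keys below are hypothetical):

from summ.structure.sql_structurer import SQLStructurer

structurer = SQLStructurer(
    query="In each department, how many times did people prefer Google over Bing?"
)
results = structurer.extract(docs)  # docs as in the Structurer example above
# e.g. {"data": [{"Department": "Sales", "Responses": "...", "AvgConfidenceScore": 92.0}, ...]}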

Source code in summ/structure/sql_structurer.py
class SQLStructurer(Structurer):
    """Constructs an in-memory SQLite database to store and query the structured data."""

    def __init__(self, query: str, **kwargs):
        super().__init__(query, **kwargs)
        self.llm.max_tokens = 1024
        self.conn = sqlite3.connect(":memory:")
        self.conn.row_factory = sqlite3.Row

    def metrics_template(self) -> PromptTemplate:
        """The template to transform a query into a list of metrics."""

        return PromptTemplate(
            template=dedent(
                f"""
                Use the query to determine which structured data is needed, and use this to create a SQL table DDL.
                Include a confidence score column with values from 0 to 100.
                If the query is qualitative, you can return an empty table.
                Your response must be valid and complete SQL.

                Prompt: {{{{ query }}}}
                DDL:
                ```
                """
            ),
            input_variables=["query"],
            template_format="jinja2",
        )

    def doc_template(self) -> PromptTemplate:
        """The template to transform a document into a list of metrics."""

        return PromptTemplate(
            template=dedent(
                """
                You will be provided with the schema for a SQL table.
                Write between zero and three SQL statements which will insert data into the table.
                You do not need to use all three statements.
                Do not insert data which is not relevant to the query. Do not insert data which is ambiguous. Do not insert data which is noisy or too long.
                Only insert data that is derived from the document provided. Do not guess or make up data.
                For each row, record your confidence that the data is relevant to the query as a number from 0 to 100, using the confidence score column.
                Your response must be valid and complete SQL.

                Query: {{ query }}
                Document:
                ```
                {{ text }}
                ```
                Schema:
                ```
                {{ schema }}
                ```
                Response:
                ```
                """
            ),
            input_variables=["query", "text", "schema"],
            template_format="jinja2",
        )

    def clean_template(self) -> PromptTemplate:
        """The template to extract the relevant rows."""

        return PromptTemplate(
            template=dedent(
                """
                Write a SQLite statement which will clean and extract rows from the table.
                Use CTEs to process and clean the data. Apply a WHERE clause to extract the relevant rows.
                The cleaning rules must be short and simple. They do not need to be comprehensive.
                Your response must be valid and complete SQLite.
                No string literal in the response may be longer than 200 characters. The response must be less than 512 tokens.
                The sample data provided is not comprehensive.

                --
                This is an example. Use it as a guide, but do not copy it.
                The content is not relevant to the task.
                Your query can take a different form.
                ```
                WITH CleanedData AS (
                SELECT
                    CASE
                    WHEN Department LIKE '%eng%' THEN 'Engineering'
                    WHEN Department = 'sales' THEN 'Sales & Marketing'
                    WHEN Department = 'marketing' THEN 'Sales & Marketing'
                    ELSE NULL
                    END AS Department,
                    CASE
                    WHEN (Answer = 'yup' OR Answer = 'yes') THEN 'Yes'
                    WHEN Answer LIKE 'no%' THEN 'No'
                    ELSE Answer
                    END AS Response,
                    ConfidenceScore
                FROM MyTable
                )
                SELECT
                Department,
                GROUP_CONCAT(Response, ', ') AS Responses,
                AVG(ConfidenceScore) AS AvgConfidenceScore
                FROM CleanedData
                WHERE ConfidenceScore > 80
                GROUP BY Department
                HAVING COUNT(Response) > 2
                ORDER BY AVG(ConfidenceScore) DESC
                LIMIT 10;
                ```
                --

                Now with the real input.

                Query: {{ query }}
                Schema:
                ```
                {{ schema }}
                ```
                Sample Data:
                ```
                {{ data }}
                ```
                Response:
                ```
                """
            ),
            input_variables=["query", "schema", "data"],
            template_format="jinja2",
        )

    @cached_property
    def schema(self) -> str:
        """Extract the DDL for a table from the query."""

        chain = LLMChain(llm=self.llm, prompt=self.metrics_template())
        results = self.cached(
            "sql",
            chain,
            Document(page_content=self.query),
            lambda d: {"query": d.page_content, "stop": "```"},
        )

        return results

    @cached_property
    def table_name(self) -> str:
        curr = self.conn.cursor()
        curr.execute("SELECT * FROM sqlite_schema WHERE type='table';")
        res = curr.fetchone()
        return res["name"]

    def extract_metrics(self, doc: Document):
        """Extract the metrics from the document"""
        results = self.cached(
            "extract_metrics",
            LLMChain(llm=self.llm, prompt=self.doc_template()),
            doc,
            lambda d: {
                "query": self.query,
                "text": d.page_content,
                "schema": self.schema,
                "stop": "```",
            },
        )
        return results

    def clean(self, metrics: list[sqlite3.Row]) -> list[dict]:
        """Extract cleaned metrics from all documents"""
        stmnt = self.cached(
            "clean_metrics",
            LLMChain(llm=self.llm, prompt=self.clean_template()),
            cast(list[Document], []),
            lambda _: {
                "query": self.query,
                "schema": self.schema,
                "data": "\n".join([str(tuple(dict(x).values())) for x in metrics]),
                "stop": "```",
            },
        )

        try:
            return [dict(x) for x in self.conn.cursor().execute(stmnt).fetchall()]
        except Exception as e:
            logging.info(e)
            return [dict(x) for x in metrics]

    def _extract(self, docs: list[Document]) -> dict[str, TVal_]:
        self.conn.cursor().executescript(self.schema)

        metrics = self._pmap(self.extract_metrics, docs)
        for metric in metrics:
            try:
                self.conn.cursor().executescript(metric)
            except Exception as e:
                logging.info(e)

        metrics = (
            self.conn.cursor()
            .execute(f"SELECT * FROM {self.table_name} ORDER BY RANDOM() LIMIT 50;")
            .fetchall()
        )
        if cleaned := self.clean(metrics):
            for metric in cleaned[0].keys():
                self.dprint("Metric", metric, color="yellow")
                self.dprint("", "\n--\n".join([str(x[metric]) for x in cleaned[:5]]))

        return {"data": cleaned}

schema: str cached property

Extract the DDL for a table from the query.
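
The first access makes one cached LLM call; a hypothetical result (the table and column names are whatever the LLM chooses):

print(structurer.schema)
# CREATE TABLE SearchEnginePreferences (
#     Department TEXT,
#     PreferredEngine TEXT,
#     ConfidenceScore INTEGER
# );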

metrics_template() -> PromptTemplate

The template to transform a query into a list of metrics.

doc_template() -> PromptTemplate

The template to transform a document into a list of metrics.

clean_template() -> PromptTemplate

The template to extract the relevant rows.

extract_metrics(doc: Document)

Extract the metrics from the document
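
Unlike the base-class version, this returns raw SQL text rather than parsed values; a sketch with hypothetical output:

sql = structurer.extract_metrics(doc)
# INSERT INTO SearchEnginePreferences (Department, PreferredEngine, ConfidenceScore)
# VALUES ('Sales', 'Google', 90);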

clean(metrics: list[sqlite3.Row]) -> list[dict]

Extract cleaned metrics from all documents
